diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 4ffc473..d43564f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,27 +1,46 @@ cmake_minimum_required (VERSION 2.8) project (flow) -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -msse4 -fopenmp") #-Wall -set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -msse4 -fopenmp") #-Wall +message(STATUS "Architecture: ${ARCH}") + +# On ghc machines +if (ARCH STREQUAL "x86") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -msse4 -fopenmp") #-Wall + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -msse4 -fopenmp") #-Wall + + # For ghc machines + set(OpenCV_DIR "/afs/cs/academic/class/15418-s17/public/sw/opencv/build") + set(Eigen3_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/build") + +# On NVIDIA Jetson +elseif (ARCH STREQUAL "ARM") + + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -std=c++11 -fopenmp") #-Wall + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -g -O3 -Wno-unknown-pragmas -Wall -fopenmp") #-Wall + +else() + message(STATUS "specify using -DARCH=x86 or -DARCH=ARM") + message(FATAL_ERROR "aborting: missing required architecture") +endif () -# For ghc machines -set(OpenCV_DIR "/afs/cs/academic/class/15418-s17/public/sw/opencv/build") -set(Eigen3_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/build") find_package(OpenCV REQUIRED) find_package(Eigen3 REQUIRED) find_package(CUDA REQUIRED) # On ghc, include DIR from FIND_PACKAGE is set wrong -set(EIGEN3_INCLUDE_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/include/eigen3") +if (ARCH STREQUAL "x86") + set(EIGEN3_INCLUDE_DIR "/afs/cs.cmu.edu/academic/class/15418-s17/public/sw/eigen/include/eigen3") +endif() include_directories(${EIGEN3_INCLUDE_DIR}) # Add extra cuda libraries set(CUDA_CUSOLVER_LIBRARIES - "/usr/local/depot/cuda-8.0/lib64/libcusolver.so" - "/usr/local/depot/cuda-8.0/lib64/libcublas.so.8.0") + "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcusolver.so" + "${CUDA_TOOLKIT_ROOT_DIR}/lib64/libcublas.so.8.0") +set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_CUSOLVER_LIBRARIES}) -set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_nppi_LIBRARY} ${CUDA_CUSOLVER_LIBRARIES}) +set(CUDA_LIBRARIES ${CUDA_LIBRARIES} ${CUDA_nppi_LIBRARY}) message(STATUS "OpenCV library status:") message(STATUS " version: ${OpenCV_VERSION}") @@ -34,6 +53,7 @@ message(STATUS " include path: ${EIGEN3_INCLUDE_DIR} ${EIGEN3_INCLUDE_DIRS}") message(STATUS "CUDA library status:") message(STATUS " version: ${CUDA_VERSION}") +message(STATUS " toolkit path: ${CUDA_TOOLKIT_ROOT_DIR}") message(STATUS " libraries: ${CUDA_LIBRARIES}") message(STATUS " include path: ${CUDA_INCLUDE_DIRS}") message(STATUS " toolkit path: ${CUDA_TOOLKIT_ROOT_DIR}") @@ -46,7 +66,14 @@ message(STATUS " cuSOLVER path: ${CUDA_CUSOLVER_LIBRARIES}") ################################################################################ set(CUDA_VERBOSE_BUILD ON) -set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -std=c++11 -arch=compute_61 -code=sm_61) + +if (ARCH STREQUAL "x86") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -arch=compute_61 -code=sm_61) +endif() + +if (ARCH STREQUAL "ARM") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -g -arch=compute_62 -code=sm_62) +endif() set(KERNELS kernels/densify.cu @@ -59,6 +86,7 @@ set(KERNELS kernels/resizeGrad.cpp kernels/resize.cpp) + set(COMMON common/RgbMat.cpp) @@ -70,20 +98,23 @@ set(CODEFILES refine_variational.cpp FDF1.0.1/image.c FDF1.0.1/opticalflow_aux.c - FDF1.0.1/solver.c) + FDF1.0.1/solver.c + ) -# RGB, Optical Flow -cuda_add_executable(flow ${COMMON} ${KERNELS} ${CODEFILES}) +# GrayScale, Optical Flow +cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS}) set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1") set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image +set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") # 8 wide SIMD (4 floats) +# set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD target_link_libraries(flow ${OpenCV_LIBS}) -# CUDA sandbox +# # CUDA sandbox # set(SANDBOX_FILES # # sandbox/process_sobel.cpp -# # sandbox/process_resize.cpp +# sandbox/process_resize.cpp # # sandbox/process_resizeGrad.cpp -# sandbox/process_pad.cpp +# # sandbox/process_pad.cpp # # sandbox/RgbMatTest.cpp # sandbox/sandbox.cpp) # cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) diff --git a/src/FDF1.0.1/image.c b/src/FDF1.0.1/image.c index 460e274..d8f044e 100644 --- a/src/FDF1.0.1/image.c +++ b/src/FDF1.0.1/image.c @@ -6,8 +6,16 @@ #include "image.h" -#include -typedef __v4sf v4sf; +// #include +// typedef __v4sf v4sf; + +#include + +#if (VECTOR_WIDTH == 4) +typedef float32x4_t v4sf; +#else +typedef float v4sf; +#endif /********** Create/Delete **********/ @@ -46,8 +54,12 @@ void image_erase(image_t *image){ void image_mul_scalar(image_t *image, const float scalar){ int i; v4sf* imp = (v4sf*) image->c1; +#if (VECTOR_WIDTH == 4) const v4sf scalarp = {scalar,scalar,scalar,scalar}; - for( i=0 ; istride/4*image->height ; i++){ +#else + const v4sf scalarp = scalar; +#endif + for( i=0 ; istride/VECTOR_WIDTH*image->height ; i++){ (*imp) *= scalarp; imp+=1; } @@ -73,7 +85,7 @@ color_image_t *color_image_new(const int width, const int height){ } image->width = width; image->height = height; - image->stride = ( (width+3) / 4 ) * 4; + image->stride = ( (width+VECTOR_WIDTH-1) / VECTOR_WIDTH ) * VECTOR_WIDTH; image->c1 = (float*) memalign(16, 3*image->stride*height*sizeof(float)); if(image->c1 == NULL){ fprintf(stderr, "Error: color_image_new() - not enough memory !\n"); @@ -374,7 +386,11 @@ convolution_t *convolution_new(const int order, const float *half_coeffs, const } static void convolve_vert_fast_3(image_t *dst, const image_t *src, const convolution_t *conv){ +#if (VECTOR_WIDTH == 4) const int iterline = (src->stride>>2)+1; +#else + const int iterline = (src->stride)+1; +#endif const float *coeff = conv->coeffs; //const float *coeff_accu = conv->coeffs_accu; v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1; @@ -399,7 +415,11 @@ static void convolve_vert_fast_3(image_t *dst, const image_t *src, const convolu } static void convolve_vert_fast_5(image_t *dst, const image_t *src, const convolution_t *conv){ +#if (VECTOR_WIDTH == 4) const int iterline = (src->stride>>2)+1; +#else + const int iterline = (src->stride)+1; +#endif const float *coeff = conv->coeffs; //const float *coeff_accu = conv->coeffs_accu; v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1; @@ -435,7 +455,11 @@ static void convolve_vert_fast_5(image_t *dst, const image_t *src, const convolu static void convolve_horiz_fast_3(image_t *dst, const image_t *src, const convolution_t *conv){ const int stride_minus_1 = src->stride-1; +#if (VECTOR_WIDTH == 4) const int iterline = (src->stride>>2); +#else + const int iterline = (src->stride); +#endif const float *coeff = conv->coeffs; v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1; // create shifted version of src @@ -466,7 +490,11 @@ static void convolve_horiz_fast_3(image_t *dst, const image_t *src, const convol static void convolve_horiz_fast_5(image_t *dst, const image_t *src, const convolution_t *conv){ const int stride_minus_1 = src->stride-1; const int stride_minus_2 = src->stride-2; +#if (VECTOR_WIDTH == 4) const int iterline = (src->stride>>2); +#else + const int iterline = (src->stride); +#endif const float *coeff = conv->coeffs; v4sf *srcp = (v4sf*) src->c1, *dstp = (v4sf*) dst->c1; float *src_p1 = (float*) malloc(sizeof(float)*src->stride*4); @@ -506,7 +534,8 @@ void convolve_horiz(image_t *dest, const image_t *src, const convolution_t *conv if(conv->order==1){ convolve_horiz_fast_3(dest,src,conv); return; - }else if(conv->order==2){ + } + else if(conv->order==2){ convolve_horiz_fast_5(dest,src,conv); return; } @@ -556,7 +585,8 @@ void convolve_vert(image_t *dest, const image_t *src, const convolution_t *conv) if(conv->order==1){ convolve_vert_fast_3(dest,src,conv); return; - }else if(conv->order==2){ + } + else if(conv->order==2){ convolve_vert_fast_5(dest,src,conv); return; } diff --git a/src/FDF1.0.1/opticalflow_aux.c b/src/FDF1.0.1/opticalflow_aux.c index 957a946..3076063 100644 --- a/src/FDF1.0.1/opticalflow_aux.c +++ b/src/FDF1.0.1/opticalflow_aux.c @@ -4,8 +4,13 @@ #include #include "opticalflow_aux.h" -#include -typedef __v4sf v4sf; +#include + +#if (VECTOR_WIDTH == 4) +typedef float32x4_t v4sf; +#else +typedef float v4sf; +#endif #define datanorm 0.1f*0.1f//0.01f // square of the normalization factor #define epsilon_color (0.001f*0.001f)//0.000001f @@ -75,9 +80,13 @@ void get_derivatives(const color_image_t *im1, const color_image_t *im2, const c #if (SELECTCHANNEL==1 | SELECTCHANNEL==2) image_t *tmp_im2 = image_new(im2->width,im2->height); v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; +#if (VECTOR_WIDTH == 4) const v4sf half = {0.5f,0.5f,0.5f,0.5f}; +#else + const v4sf half = 0.5f; +#endif int i=0; - for(i=0 ; iheight*im1->stride/4 ; i++){ + for(i=0 ; iheight*im1->stride/VECTOR_WIDTH ; i++){ *tmp_im2p = half * ( (*im2p) + (*im1p) ); *dtp = (*im2p)-(*im1p); dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; @@ -95,9 +104,13 @@ void get_derivatives(const color_image_t *im1, const color_image_t *im2, const c #else color_image_t *tmp_im2 = color_image_new(im2->width,im2->height); v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; +#if (VECTOR_WIDTH == 4) const v4sf half = {0.5f,0.5f,0.5f,0.5f}; +#else + const v4sf half = 0.5f; +#endif int i=0; - for(i=0 ; i<3*im1->height*im1->stride/4 ; i++){ + for(i=0 ; i<3*im1->height*im1->stride/VECTOR_WIDTH; i++){ *tmp_im2p = half * ( (*im2p) + (*im1p) ); *dtp = (*im2p)-(*im1p); dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; @@ -131,10 +144,21 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu convolve_vert(vy, vv, deriv_flow); // compute smoothness v4sf *uxp = (v4sf*) ux->c1, *vxp = (v4sf*) vx->c1, *uyp = (v4sf*) uy->c1, *vyp = (v4sf*) vy->c1, *sp = (v4sf*) smoothness->c1; +#if (VECTOR_WIDTH == 4) const v4sf qa = {quarter_alpha,quarter_alpha,quarter_alpha,quarter_alpha}; const v4sf epsmooth = {epsilon_smooth,epsilon_smooth,epsilon_smooth,epsilon_smooth}; - for(j=0 ; j< height*stride/4 ; j++){ - *sp = qa / __builtin_ia32_sqrtps( (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); +#else + const v4sf qa = quarter_alpha; + const v4sf epsmooth = epsilon_smooth; +#endif + for(j=0 ; j< height*stride/VECTOR_WIDTH ; j++){ +#if (VECTOR_WIDTH == 4) + *sp = qa / vsqrtq_f32( + (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); +#else + *sp = qa / sqrtf( + (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); +#endif sp+=1;uxp+=1; uyp+=1; vxp+=1; vyp+=1; } image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); @@ -147,7 +171,7 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu memcpy(sp_shift, spf+1, sizeof(float)*(stride-1)); v4sf *sps = (v4sf*) sp_shift; int i; - for(i=0;ic1, *sp_bottom = (v4sf*) (smoothness->c1+stride); sp = (v4sf*) smoothness->c1; - for(j=0 ; j<(height-1)*stride/4 ; j++){ + for(j=0 ; j<(height-1)*stride/VECTOR_WIDTH ; j++){ *dstvp = (*sp) + (*sp_bottom); dstvp+=1; sp+=1; sp_bottom+=1; } @@ -190,7 +214,7 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz } v4sf *wvp = (v4sf*) weight_vert->c1, *srcp = (v4sf*) src->c1, *srcp_s = (v4sf*) (src->c1+src->stride), *dstp = (v4sf*) dst->c1, *dstp_s = (v4sf*) (dst->c1+src->stride); - for(j=1+(src->height-1)*src->stride/4 ; --j ;){ + for(j=1+(src->height-1)*src->stride/VECTOR_WIDTH ; --j ;){ const v4sf tmp = (*wvp) * ((*srcp_s)-(*srcp)); *dstp += tmp; *dstp_s -= tmp; @@ -198,111 +222,6 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz } } -/* compute the dataterm and the matching term - a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side - other (color) images are input */ -void compute_data_and_match(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, image_t *desc_weight, image_t *desc_flow_x, image_t *desc_flow_y, const float half_delta_over3, const float half_beta, const float half_gamma_over3){ - - const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm}; - const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3}; - const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color}; - const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3}; - const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad}; - const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta}; - const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc}; - - v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1, - *maskp = (v4sf*) mask->c1, - *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, - *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, - *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, - *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, - *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, - *uup = (v4sf*) uu->c1, *vvp = (v4sf*)vv->c1, *wxp = (v4sf*)wx->c1, *wyp = (v4sf*)wy->c1, - *descflowxp = (v4sf*)desc_flow_x->c1, *descflowyp = (v4sf*)desc_flow_y->c1, *descweightp = (v4sf*)desc_weight->c1; - - memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); - memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); - - int i; - for(i = 0 ; iheight*uu->stride/4 ; i++){ - v4sf tmp, tmp2, tmp3, tmp4, tmp5, tmp6, n1, n2, n3, n4, n5, n6; - // dpsi color - if(half_delta_over3){ - tmp = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp); - n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; - tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp); - n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; - tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); - n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; - tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); - tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; - *a11p += tmp * (*ix1p) * (*ix1p); - *a12p += tmp * (*ix1p) * (*iy1p); - *a22p += tmp * (*iy1p) * (*iy1p); - *b1p -= tmp * (*iz1p) * (*ix1p); - *b2p -= tmp * (*iz1p) * (*iy1p); - *a11p += tmp2 * (*ix2p) * (*ix2p); - *a12p += tmp2 * (*ix2p) * (*iy2p); - *a22p += tmp2 * (*iy2p) * (*iy2p); - *b1p -= tmp2 * (*iz2p) * (*ix2p); - *b2p -= tmp2 * (*iz2p) * (*iy2p); - *a11p += tmp3 * (*ix3p) * (*ix3p); - *a12p += tmp3 * (*ix3p) * (*iy3p); - *a22p += tmp3 * (*iy3p) * (*iy3p); - *b1p -= tmp3 * (*iz3p) * (*ix3p); - *b2p -= tmp3 * (*iz3p) * (*iy3p); - } - // dpsi gradient - n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; - n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; - tmp = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp); - tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp); - n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; - n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; - tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp); - tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp); - n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; - n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; - tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); - tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); - tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); - tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; - *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); - *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p); - *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p); - *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); - *b2p -= tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p); - *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); - *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p); - *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p); - *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); - *b2p -= tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p); - *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); - *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p); - *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p); - *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); - *b2p -= tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p); - if(half_beta){ // dpsi_match - tmp = *uup - (*descflowxp); - tmp2 = *vvp - (*descflowyp); - tmp = hbeta*(*descweightp)/__builtin_ia32_sqrtps(tmp*tmp+tmp2*tmp2+epsdesc); - *a11p += tmp; - *a22p += tmp; - *b1p -= tmp*((*wxp)-(*descflowxp)); - *b2p -= tmp*((*wyp)-(*descflowyp)); - } - dup+=1; dvp+=1; maskp+=1; a11p+=1; a12p+=1; a22p+=1; b1p+=1; b2p+=1; - ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1; - ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1; - ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1; - uup+=1;vvp+=1;wxp+=1; wyp+=1;descflowxp+=1;descflowyp+=1;descweightp+=1; - } -} - /* compute the dataterm // REMOVED MATCHING TERM a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side other (color) images are input */ @@ -312,6 +231,7 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) #endif { +#if (VECTOR_WIDTH == 4) const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm}; const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3}; const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color}; @@ -319,6 +239,13 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad}; //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta}; //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc}; +#else + const v4sf dnorm = datanorm; + const v4sf hdover3 = half_delta_over3; + const v4sf epscolor = epsilon_color; + const v4sf hgover3 = half_gamma_over3; + const v4sf epsgrad = epsilon_grad; +#endif v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1, *maskp = (v4sf*) mask->c1, @@ -339,7 +266,7 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); int i; - for(i = 0 ; iheight*uu->stride/4 ; i++){ + for(i = 0 ; iheight*uu->stride/VECTOR_WIDTH ; i++){ v4sf tmp, tmp2, n1, n2; #if (SELECTCHANNEL==3) v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; @@ -353,10 +280,18 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; - tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); +#if (VECTOR_WIDTH == 4) + tmp = (*maskp) * hdover3 / vsqrtq_f32(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); +#else + tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); +#endif tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; #else - tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(3 * tmp*tmp/n1 + epscolor); +#if (VECTOR_WIDTH == 4) + tmp = (*maskp) * hdover3 / vsqrtq_f32(3 * tmp*tmp/n1 + epscolor); +#else + tmp = (*maskp) * hdover3 / sqrtf(3 * tmp*tmp/n1 + epscolor); +#endif tmp /= n1; #endif *a11p += tmp * (*ix1p) * (*ix1p); @@ -392,10 +327,20 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); - tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); +#if (VECTOR_WIDTH == 4) + tmp = (*maskp) * hgover3 / vsqrtq_f32( + tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); +#else + tmp = (*maskp) * hgover3 / sqrtf( + tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); +#endif tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; #else - tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); +#if (VECTOR_WIDTH == 4) + tmp = (*maskp) * hgover3 / vsqrtq_f32(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); +#else + tmp = (*maskp) * hgover3 / sqrtf(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); +#endif tmp2 = tmp/n2; tmp /= n1; #endif *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); @@ -437,193 +382,3 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t } } - - -/* compute the dataterm // REMOVED MATCHING TERM - a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side - other (color) images are input */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#else -void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#endif -{ - const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm}; - const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3}; - const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color}; - const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3}; - const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad}; - //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta}; - //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc}; - - v4sf *dup = (v4sf*) du->c1, - *maskp = (v4sf*) mask->c1, - *a11p = (v4sf*) a11->c1, - *b1p = (v4sf*) b1->c1, - *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, - #if (SELECTCHANNEL==3) - *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, - *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, - #endif - *uup = (v4sf*) uu->c1, *wxp = (v4sf*)wx->c1; - - - memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); - - int i; - for(i = 0 ; iheight*uu->stride/4 ; i++){ - v4sf tmp, tmp2, n1, n2; - #if (SELECTCHANNEL==3) - v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; - #endif - // dpsi color - if(half_delta_over3){ - tmp = *iz1p + (*ix1p)*(*dup); - n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; - #if (SELECTCHANNEL==3) - tmp2 = *iz2p + (*ix2p)*(*dup); - n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; - tmp3 = *iz3p + (*ix3p)*(*dup); - n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; - tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); - tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; - #else - tmp = (*maskp) * hdover3 / __builtin_ia32_sqrtps(3 * tmp*tmp/n1 + epscolor); - tmp /= n1; - #endif - *a11p += tmp * (*ix1p) * (*ix1p); - *b1p -= tmp * (*iz1p) * (*ix1p); - #if (SELECTCHANNEL==3) - *a11p += tmp2 * (*ix2p) * (*ix2p); - *b1p -= tmp2 * (*iz2p) * (*ix2p); - *a11p += tmp3 * (*ix3p) * (*ix3p); - *b1p -= tmp3 * (*iz3p) * (*ix3p); - #endif - } - // dpsi gradient - n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; - n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; - tmp = *ixz1p + (*ixx1p) * (*dup); - tmp2 = *iyz1p + (*ixy1p) * (*dup); - #if (SELECTCHANNEL==3) - n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; - n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; - tmp3 = *ixz2p + (*ixx2p) * (*dup); - tmp4 = *iyz2p + (*ixy2p) * (*dup); - n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; - n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; - tmp5 = *ixz3p + (*ixx3p) * (*dup); - tmp6 = *iyz3p + (*ixy3p) * (*dup); - tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); - tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; - #else - tmp = (*maskp) * hgover3 / __builtin_ia32_sqrtps(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); - tmp2 = tmp/n2; tmp /= n1; - #endif - *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); - *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); - #if (SELECTCHANNEL==3) - *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); - *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); - *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); - *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); - #endif - - - #if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // multiply system to make smoothing parameters same for RGB and single-channel image - *a11p *= 3; - *b1p *= 3; - #endif - - dup+=1; maskp+=1; a11p+=1; b1p+=1; - ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1; - #if (SELECTCHANNEL==3) - ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1; - ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1; - #endif - uup+=1; wxp+=1; - - } -} - - - -/* resize the descriptors to the new size using a weighted mean */ -void descflow_resize(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight){ - const int src_width = src_flow_x->width, src_height = src_flow_x->height, src_stride = src_flow_x->stride, - dst_width = dst_flow_x->width, dst_height = dst_flow_x->height, dst_stride = dst_flow_x->stride; - const float scale_x = ((float)dst_width-1)/((float)src_width-1), scale_y = ((float)dst_height-1)/((float)src_height-1); - image_erase(dst_flow_x); image_erase(dst_flow_y); image_erase(dst_weight); - int j; - for( j=0 ; jc1[j*src_stride+i]; - if( weight<0.0000000001f ) continue; - const float xx = ((float)i)*scale_x; - const float xxf = floor(xx); - const float dx = xx-xxf; - const int x1 = MINMAX_TA( (int) xxf , dst_width); - const int x2 = MINMAX_TA( (int) xxf+1 , dst_width); - float weightxy, newweight; - if( dx ){ - if( dy ){ - weightxy = weight*dx*dy; - newweight = dst_weight->c1[y2*dst_stride+x2] + weightxy; - dst_flow_x->c1[y2*dst_stride+x2] = (dst_flow_x->c1[y2*dst_stride+x2]*dst_weight->c1[y2*dst_stride+x2] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight; - dst_flow_y->c1[y2*dst_stride+x2] = (dst_flow_y->c1[y2*dst_stride+x2]*dst_weight->c1[y2*dst_stride+x2] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight; - dst_weight->c1[y2*dst_stride+x2] = newweight; - } - weightxy = weight*dx*(1.0f-dy); - newweight = dst_weight->c1[y1*dst_stride+x2] + weightxy; - dst_flow_x->c1[y1*dst_stride+x2] = (dst_flow_x->c1[y1*dst_stride+x2]*dst_weight->c1[y1*dst_stride+x2] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight; - dst_flow_y->c1[y1*dst_stride+x2] = (dst_flow_y->c1[y1*dst_stride+x2]*dst_weight->c1[y1*dst_stride+x2] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight; - dst_weight->c1[y1*dst_stride+x2] = newweight; - } - if( dy ) { - weightxy = weight*(1.0f-dx)*dy; - newweight = dst_weight->c1[y2*dst_stride+x1] + weightxy; - dst_flow_x->c1[y2*dst_stride+x1] = (dst_flow_x->c1[y2*dst_stride+x1]*dst_weight->c1[y2*dst_stride+x1] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight; - dst_flow_y->c1[y2*dst_stride+x1] = (dst_flow_y->c1[y2*dst_stride+x1]*dst_weight->c1[y2*dst_stride+x1] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight; - dst_weight->c1[y2*dst_stride+x1] = newweight; - } - weightxy = weight*(1.0f-dx)*(1.0f-dy); - newweight = dst_weight->c1[y1*dst_stride+x1] + weightxy; - dst_flow_x->c1[y1*dst_stride+x1] = (dst_flow_x->c1[y1*dst_stride+x1]*dst_weight->c1[y1*dst_stride+x1] + src_flow_x->c1[j*src_stride+i]*weightxy*scale_x)/newweight; - dst_flow_y->c1[y1*dst_stride+x1] = (dst_flow_y->c1[y1*dst_stride+x1]*dst_weight->c1[y1*dst_stride+x1] + src_flow_y->c1[j*src_stride+i]*weightxy*scale_y)/newweight; - dst_weight->c1[y1*dst_stride+x1] = newweight; - } - } -} - -/* resize the descriptors to the new size using a nearest neighbor method while keeping the descriptor with the higher weight at the end */ -void descflow_resize_nn(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight){ - const int src_width = src_flow_x->width, src_height = src_flow_x->height, src_stride = src_flow_x->stride, - dst_width = dst_flow_x->width, dst_height = dst_flow_x->height, dst_stride = dst_flow_x->stride; - const float scale_x = ((float)dst_width-1)/((float)src_width-1), scale_y = ((float)dst_height-1)/((float)src_height-1); - image_erase(dst_flow_x); image_erase(dst_flow_y); image_erase(dst_weight); - int j; - for( j=0 ; jc1[j*src_stride+i]; - if( !weight ) - continue; - const float xx = ((float)i)*scale_x; - const int x = (int) 0.5f+xx; // equivalent to round(xx) - if( dst_weight->c1[y*dst_stride+x] < weight ){ - dst_weight->c1[y*dst_stride+x] = weight; - dst_flow_x->c1[y*dst_stride+x] = src_flow_x->c1[j*src_stride+i]*scale_x; - dst_flow_y->c1[y*dst_stride+x] = src_flow_y->c1[j*src_stride+i]*scale_y; - } - } - } -} diff --git a/src/FDF1.0.1/opticalflow_aux.h b/src/FDF1.0.1/opticalflow_aux.h index 063ac23..ebca1f1 100644 --- a/src/FDF1.0.1/opticalflow_aux.h +++ b/src/FDF1.0.1/opticalflow_aux.h @@ -33,11 +33,6 @@ void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu /* sub the laplacian (smoothness term) to the right-hand term */ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert); -/* compute the dataterm and the matching term - a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side - other (color) images are input */ -void compute_data_and_match(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, image_t *desc_weight, image_t *desc_flow_x, image_t *desc_flow_y, const float half_delta_over3, const float half_beta, const float half_gamma_over3); - /* compute the dataterm and ... REMOVED THE MATCHING TERM a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side other (color) images are input */ @@ -47,18 +42,6 @@ void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); #endif -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#else -void compute_data_DE(image_t *a11, image_t *b1, image_t *mask, image_t *wx, image_t *du, image_t *uu, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#endif - - -/* resize the descriptors to the new size using a weighted mean */ -void descflow_resize(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight); - -/* resize the descriptors to the new size using a nearest neighbor method while keeping the descriptor with the higher weight at the end */ -void descflow_resize_nn(image_t *dst_flow_x, image_t *dst_flow_y, image_t *dst_weight, const image_t *src_flow_x, const image_t *src_flow_y, const image_t *src_weight); #ifdef __cplusplus } diff --git a/src/FDF1.0.1/solver.c b/src/FDF1.0.1/solver.c index a4402e9..162b3a0 100644 --- a/src/FDF1.0.1/solver.c +++ b/src/FDF1.0.1/solver.c @@ -9,8 +9,12 @@ #include "image.h" #include "solver.h" -#include -typedef __v4sf v4sf; +// #include +// #if (VECTOR_WIDTH == 4) +// typedef float32x4_t v4sf; +// #else +// typedef float v4sf; +// #endif //THIS IS A SLOW VERSION BUT READABLE //Perform n iterations of the sor_coupled algorithm @@ -74,396 +78,349 @@ void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image // THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION // the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal // each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line -void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ - //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); - - if(du->width<2 || du->height<2 || iterations < 1){ - sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); - return; - } - - const int stride = du->stride, width = du->width; - const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); - int j,iter,i,k; - float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); - if(floatarray==NULL){ - fprintf(stderr, "error in sor_coupled(): not enough memory\n"); - exit(1); - } - float *f1 = floatarray; - float *f2 = f1+stride; - float *f3 = f2+stride; - f1[0] = 0.0f; - memset(&f1[width], 0, sizeof(float)*(stride-width)); - memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); - memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); - - { // first iteration - v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; - float *du_ptr = du->c1, *dv_ptr = dv->c1; - v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); - - { // first iteration - first line - - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vp); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - for(i=iterline;--i;){ - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vp); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); - for(k=0;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - - v4sf *vpt = (v4sf*) dpsis_vert->c1; - v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; - - for(j=iterheight;--j;){ // first iteration - middle lines - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - for(i=iterline;--i;){ - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); - for(k=0;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - - { // first iteration - last line - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vpt); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - for(i=iterline;--i;){ - // reverse 2x2 diagonal block - const v4sf dpsis = (*hpl) + (*hp) + (*vpt); - const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; - const v4sf det = A11*A22 - (*a12p)*(*a12p); - *a11p = A11/det; - *a22p = A22/det; - *a12p /= -det; - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); - for(k=0;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - } - - - - for(iter=iterations;--iter;) // other iterations - { - v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; - float *du_ptr = du->c1, *dv_ptr = dv->c1; - v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); - - { // other iteration - first line - - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - for(i=iterline;--i;){ - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); - for(k=0;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - - v4sf *vpt = (v4sf*) dpsis_vert->c1; - v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; - - for(j=iterheight;--j;) // other iteration - middle lines - { - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++) - { - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - for(i=iterline; --i;) - { - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); - for(k=0;k<4;k++) - { - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - - { // other iteration - last line - memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); - memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); - memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); - v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; - - { // left block - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); - du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); - dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); - for(k=1;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - for(i=iterline;--i;){ - // do one iteration - const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); - const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); - for(k=0;k<4;k++){ - const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; - const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; - du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); - dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); - } - // increment pointer - hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; - dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; - du_ptr += 4; dv_ptr += 4; - } - - } - } - - - - free(floatarray); - -} - - -//THIS IS A SLOW VERSION BUT READABLE -//Perform n iterations of the sor_coupled algorithm -//du is used as initial guesses -//The system form is the same as in opticalflow.c -void sor_coupled_slow_but_readable_DE(image_t *du, const image_t *a11, const image_t *b1, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega) -{ - int i,j,iter; - for(iter = 0 ; iterheight ; j++) - { - float sigma_u,sum_dpsis,A11,B1; - for(i=0 ; iwidth ; i++){ - sigma_u = 0.0f; - sum_dpsis = 0.0f; - if(j>0) - { - sigma_u -= dpsis_vert->c1[(j-1)*du->stride+i]*du->c1[(j-1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[(j-1)*du->stride+i]; - } - if(i>0) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i-1]*du->c1[j*du->stride+i-1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i-1]; - } - if(jheight-1) - { - sigma_u -= dpsis_vert->c1[j*du->stride+i]*du->c1[(j+1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[j*du->stride+i]; - } - if(iwidth-1) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i]*du->c1[j*du->stride+i+1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i]; - } - A11 = a11->c1[j*du->stride+i]+sum_dpsis; - B1 = b1->c1[j*du->stride+i]-sigma_u; - du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] +omega*( B1/A11 ); - } - } - } -} - - +// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ +// //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); +// +// if(du->width<2 || du->height<2 || iterations < 1){ +// sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); +// return; +// } +// +// const int stride = du->stride, width = du->width; +// const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); +// int j,iter,i,k; +// float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); +// if(floatarray==NULL){ +// fprintf(stderr, "error in sor_coupled(): not enough memory\n"); +// exit(1); +// } +// float *f1 = floatarray; +// float *f2 = f1+stride; +// float *f3 = f2+stride; +// f1[0] = 0.0f; +// memset(&f1[width], 0, sizeof(float)*(stride-width)); +// memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); +// memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); +// +// { // first iteration +// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; +// float *du_ptr = du->c1, *dv_ptr = dv->c1; +// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); +// +// { // first iteration - first line +// +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// v4sf *vpt = (v4sf*) dpsis_vert->c1; +// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; +// +// for(j=iterheight;--j;){ // first iteration - middle lines +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// { // first iteration - last line +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// } +// +// +// +// for(iter=iterations;--iter;) // other iterations +// { +// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; +// float *du_ptr = du->c1, *dv_ptr = dv->c1; +// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); +// +// { // other iteration - first line +// +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// v4sf *vpt = (v4sf*) dpsis_vert->c1; +// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; +// +// for(j=iterheight;--j;) // other iteration - middle lines +// { +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++) +// { +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// for(i=iterline; --i;) +// { +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++) +// { +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// { // other iteration - last line +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// } +// +// +// +// free(floatarray); +// +// } diff --git a/src/FDF1.0.1/solver.h b/src/FDF1.0.1/solver.h index 6064349..ee75e7a 100644 --- a/src/FDF1.0.1/solver.h +++ b/src/FDF1.0.1/solver.h @@ -12,8 +12,6 @@ void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t * void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega); -void sor_coupled_slow_but_readable_DE(image_t *du, const image_t *a11, const image_t *b1, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega); - #ifdef __cplusplus } #endif diff --git a/src/oflow.cpp b/src/oflow.cpp index d38e709..f8f67ae 100644 --- a/src/oflow.cpp +++ b/src/oflow.cpp @@ -47,9 +47,9 @@ namespace OFC { op.n_vals = 3 * pow(op.patch_size, 2); op.n_scales = op.coarsest_scale - op.finest_scale + 1; float norm_outlier2 = pow(op.norm_outlier, 2); - op.norm_outlier_tmpbsq = (v4sf) {norm_outlier2, norm_outlier2, norm_outlier2, norm_outlier2}; - op.norm_outlier_tmp2bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.twos); - op.norm_outlier_tmp4bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.fours); + // op.norm_outlier_tmpbsq = (v4sf) {norm_outlier2, norm_outlier2, norm_outlier2, norm_outlier2}; + // op.norm_outlier_tmp2bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.twos); + // op.norm_outlier_tmp4bsq = __builtin_ia32_mulps(op.norm_outlier_tmpbsq, op.fours); op.dp_thresh = 0.05 * 0.05; op.dr_thresh = 0.95; op.res_thresh = 0.0; diff --git a/src/params.h b/src/params.h index bee846c..a1bae87 100644 --- a/src/params.h +++ b/src/params.h @@ -5,8 +5,6 @@ namespace OFC { - typedef __v4sf v4sf; - typedef struct { int width; // image width, does not include '2 * padding', but includes original padding to ensure integer divisible image width and height @@ -55,15 +53,15 @@ namespace OFC { cublasHandle_t cublasHandle; // Helper variables - v4sf zero = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f}; - v4sf negzero = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f}; - v4sf half = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f}; - v4sf ones = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f}; - v4sf twos = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f}; - v4sf fours = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f}; - v4sf norm_outlier_tmpbsq; - v4sf norm_outlier_tmp2bsq; - v4sf norm_outlier_tmp4bsq; + // v4sf zero = (v4sf) {0.0f, 0.0f, 0.0f, 0.0f}; + // v4sf negzero = (v4sf) {-0.0f, -0.0f, -0.0f, -0.0f}; + // v4sf half = (v4sf) {0.5f, 0.5f, 0.5f, 0.5f}; + // v4sf ones = (v4sf) {1.0f, 1.0f, 1.0f, 1.0f}; + // v4sf twos = (v4sf) {2.0f, 2.0f, 2.0f, 2.0f}; + // v4sf fours = (v4sf) {4.0f, 4.0f, 4.0f, 4.0f}; + // v4sf norm_outlier_tmpbsq; + // v4sf norm_outlier_tmp2bsq; + // v4sf norm_outlier_tmp4bsq; } opt_params; } diff --git a/src/patch.cpp b/src/patch.cpp index f48da59..dbffe0d 100644 --- a/src/patch.cpp +++ b/src/patch.cpp @@ -28,7 +28,6 @@ using std::vector; namespace OFC { - typedef __v4sf v4sf; PatClass::PatClass( const img_params* _i_params, diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp index 8086abb..c40f25d 100644 --- a/src/refine_variational.cpp +++ b/src/refine_variational.cpp @@ -10,6 +10,7 @@ #include #include +#include #include "refine_variational.h" @@ -144,7 +145,6 @@ namespace OFC { *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives // warp second image - image_warp(w_im2, mask, im2, wx, wy); // compute derivatives get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); @@ -165,16 +165,16 @@ namespace OFC { sub_laplacian(b2, wy, smooth_horiz, smooth_vert); // solve system -#ifdef WITH_OPENMP +// #ifdef WITH_OPENMP sor_coupled_slow_but_readable(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // slower but parallelized -#else - sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); -#endif +// #else +// sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); +// #endif // update flow plus flow increment int i; v4sf *uup = (v4sf*) uu->c1, *vvp = (v4sf*) vv->c1, *wxp = (v4sf*) wx->c1, *wyp = (v4sf*) wy->c1, *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1; - for( i=0 ; iwidth; - int height = wx->height; - int stride = wx->stride; - - image_t *du = image_new(width,height), *wy_dummy = image_new(width,height), // the flow increment - *mask = image_new(width,height), // mask containing 0 if a point goes outside image boundary, 1 otherwise - *smooth_horiz = image_new(width,height), *smooth_vert = image_new(width,height), // horiz: (i,j) contains the diffusivity coeff. from (i,j) to (i+1,j) - *uu = image_new(width,height), // flow plus flow increment - *a11 = image_new(width,height), // system matrix A of Ax=b for each pixel - *b1 = image_new(width,height); // system matrix b of Ax=b for each pixel - - image_erase(wy_dummy); - - color_image_t *w_im2 = color_image_new(width,height), // warped second image - *Ix = color_image_new(width,height), *Iy = color_image_new(width,height), *Iz = color_image_new(width,height), // first order derivatives - *Ixx = color_image_new(width,height), *Ixy = color_image_new(width,height), *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives - - // warp second image - image_warp(w_im2, mask, im2, wx, wy_dummy); - // compute derivatives - get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); - // erase du and dv - image_erase(du); - - // initialize uu and vv - memcpy(uu->c1,wx->c1,wx->stride*wx->height*sizeof(float)); - - // inner fixed point iterations - for(i_inner_iteration = 0 ; i_inner_iteration < vr.inner_iter ; i_inner_iteration++) { - - // compute robust function and system - compute_smoothness(smooth_horiz, smooth_vert, uu, wy_dummy, deriv_flow, vr.tmp_quarter_alpha ); - compute_data_DE(a11, b1, mask, wx, du, uu, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); - sub_laplacian(b1, wx, smooth_horiz, smooth_vert); - - // solve system - sor_coupled_slow_but_readable_DE(du, a11, b1, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); - - // update flow plus flow increment - int i; - v4sf *uup = (v4sf*) uu->c1, *wxp = (v4sf*) wx->c1, *dup = (v4sf*) du->c1; - - for( i=0 ; izero); - uup+=1; wxp+=1; dup+=1; - } - - } - - // add flow increment to current flow - memcpy(wx->c1,uu->c1,uu->stride*uu->height*sizeof(float)); - - // free memory - image_delete(du); image_delete(wy_dummy); - image_delete(mask); - image_delete(smooth_horiz); image_delete(smooth_vert); - image_delete(uu); - image_delete(a11); - image_delete(b1); - - color_image_delete(w_im2); - color_image_delete(Ix); color_image_delete(Iy); color_image_delete(Iz); - color_image_delete(Ixx); color_image_delete(Ixy); color_image_delete(Iyy); color_image_delete(Ixz); color_image_delete(Iyz); - - } - - VarRefClass::~VarRefClass() { } } diff --git a/src/refine_variational.h b/src/refine_variational.h index ca469f1..9f98d99 100644 --- a/src/refine_variational.h +++ b/src/refine_variational.h @@ -9,7 +9,12 @@ namespace OFC { - typedef __v4sf v4sf; + // typedef __v4sf v4sf; +#if (VECTOR_WIDTH == 4) + typedef float32x4_t v4sf; +#else + typedef float v4sf; +#endif typedef struct { @@ -42,7 +47,6 @@ namespace OFC { void copyimage(const float* img, color_image_t * img_t); void RefLevelOF(image_t *wx, image_t *wy, const color_image_t *im1, const color_image_t *im2); - void RefLevelDE(image_t *wx, const color_image_t *im1, const color_image_t *im2); VR_params vr; const img_params* i_params;