diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c1b4731..1557bcc 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -76,6 +76,7 @@ if (ARCH STREQUAL "ARM") endif() set(KERNELS + kernels/flowUtil.cu kernels/densify.cu kernels/interpolate.cu kernels/extract.cu @@ -98,16 +99,17 @@ set(CODEFILES # patch.cpp patchgrid.cpp refine_variational.cpp - FDF1.0.1/image.c - FDF1.0.1/opticalflow_aux.c - FDF1.0.1/solver.c + # FDF1.0.1/image.c + FDF1.0.1/image.cpp + FDF1.0.1/opticalflow_aux.cpp + FDF1.0.1/solver.cpp ) # GrayScale, Optical Flow cuda_add_executable(flow ${COMMON} ${CODEFILES} ${KERNELS}) set_target_properties (flow PROPERTIES COMPILE_DEFINITIONS "SELECTMODE=1") -set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") # use RGB image set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") # 8 wide SIMD (4 floats) +set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "UNIFIED_MEM=1") # Use zero copy memory # set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD target_link_libraries(flow ${OpenCV_LIBS}) diff --git a/src/FDF1.0.1/image.c b/src/FDF1.0.1/image.cpp similarity index 97% rename from src/FDF1.0.1/image.c rename to src/FDF1.0.1/image.cpp index d8f044e..146caeb 100644 --- a/src/FDF1.0.1/image.c +++ b/src/FDF1.0.1/image.cpp @@ -6,6 +6,10 @@ #include "image.h" +#include +#include +#include "../common/cuda_helper.h" + // #include // typedef __v4sf v4sf; @@ -29,7 +33,11 @@ image_t *image_new(const int width, const int height){ image->width = width; image->height = height; image->stride = ( (width+3) / 4 ) * 4; +#if (UNIFIED_MEM) + checkCudaErrors( cudaHostAlloc((void**) &image->c1, image->stride*height*sizeof(float), cudaHostAllocMapped) ); +#else image->c1 = (float*) memalign(16, image->stride*height*sizeof(float)); +#endif if(image->c1== NULL){ fprintf(stderr, "Error: image_new() - not enough memory !\n"); exit(1); @@ -70,7 +78,11 @@ void image_delete(image_t *image){ if(image == NULL){ //fprintf(stderr, "Warning: Delete image --> Ignore action (image not allocated)\n"); }else{ +#if (UNIFIED_MEM) + cudaFree(image->c1); +#else free(image->c1); +#endif free(image); } } @@ -86,7 +98,11 @@ color_image_t *color_image_new(const int width, const int height){ image->width = width; image->height = height; image->stride = ( (width+VECTOR_WIDTH-1) / VECTOR_WIDTH ) * VECTOR_WIDTH; +#if (UNIFIED_MEM) + checkCudaErrors( cudaHostAlloc((void**) &image->c1, 3*image->stride*height*sizeof(float), cudaHostAllocMapped) ); +#else image->c1 = (float*) memalign(16, 3*image->stride*height*sizeof(float)); +#endif if(image->c1 == NULL){ fprintf(stderr, "Error: color_image_new() - not enough memory !\n"); exit(1); @@ -111,7 +127,11 @@ void color_image_erase(color_image_t *image){ /* free memory of a color image */ void color_image_delete(color_image_t *image){ if(image){ +#if (UNIFIED_MEM) + cudaFree(image->c1); +#else free(image->c1); // c2 and c3 was allocated at the same moment +#endif free(image); } } @@ -665,7 +685,7 @@ void color_image_convolve_hv(color_image_t *dst, const color_image_t *src, const dst_red = {width,height,stride,dst->c1}, dst_green = {width,height,stride,dst->c2}, dst_blue = {width,height,stride,dst->c3}; // horizontal and vertical if(horiz_conv != NULL && vert_conv != NULL){ - float *tmp_data = malloc(sizeof(float)*stride*height); + float *tmp_data = (float*) malloc(sizeof(float)*stride*height); if(tmp_data == NULL){ fprintf(stderr,"error color_image_convolve_hv(): not enough 
memory\n"); exit(1); @@ -699,7 +719,7 @@ void image_convolve_hv(image_t *dst, const image_t *src, const convolution_t *ho dst_red = {width,height,stride,dst->c1}; // horizontal and vertical if(horiz_conv != NULL && vert_conv != NULL){ - float *tmp_data = malloc(sizeof(float)*stride*height); + float *tmp_data = (float*) malloc(sizeof(float)*stride*height); if(tmp_data == NULL){ fprintf(stderr,"error image_convolve_hv(): not enough memory\n"); exit(1); diff --git a/src/FDF1.0.1/opticalflow_aux.c b/src/FDF1.0.1/opticalflow_aux.c deleted file mode 100644 index 9b6648e..0000000 --- a/src/FDF1.0.1/opticalflow_aux.c +++ /dev/null @@ -1,384 +0,0 @@ -#include -#include -#include -#include -#include "opticalflow_aux.h" - -#include - -#if (VECTOR_WIDTH == 4) -typedef float32x4_t v4sf; -#else -typedef float v4sf; -#endif - -#define datanorm 0.1f*0.1f//0.01f // square of the normalization factor -#define epsilon_color (0.001f*0.001f)//0.000001f -#define epsilon_grad (0.001f*0.001f)//0.000001f -#define epsilon_desc (0.001f*0.001f)//0.000001f -#define epsilon_smooth (0.001f*0.001f)//0.000001f - -/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void image_warp(image_t *dst, image_t *mask, const image_t *src, const image_t *wx, const image_t *wy) -#else -void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) -#endif -{ - int i, j, offset, incr_line = mask->stride-mask->width, x, y, x1, x2, y1, y2; - float xx, yy, dx, dy; - for(j=0,offset=0 ; jheight ; j++) - { - for(i=0 ; iwidth ; i++,offset++) - { - xx = i+wx->c1[offset]; - yy = j+wy->c1[offset]; - x = floor(xx); - y = floor(yy); - dx = xx-x; - dy = yy-y; - mask->c1[offset] = (xx>=0 && xx<=src->width-1 && yy>=0 && yy<=src->height-1); - x1 = MINMAX_TA(x,src->width); - x2 = MINMAX_TA(x+1,src->width); - y1 = MINMAX_TA(y,src->height); - y2 = MINMAX_TA(y+1,src->height); - dst->c1[offset] = - src->c1[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c1[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c1[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c1[y2*src->stride+x2]*dx*dy; -#if (SELECTCHANNEL==3) - dst->c2[offset] = - src->c2[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c2[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c2[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c2[y2*src->stride+x2]*dx*dy; - dst->c3[offset] = - src->c3[y1*src->stride+x1]*(1.0f-dx)*(1.0f-dy) + - src->c3[y1*src->stride+x2]*dx*(1.0f-dy) + - src->c3[y2*src->stride+x1]*(1.0f-dx)*dy + - src->c3[y2*src->stride+x2]*dx*dy; -#endif - } - offset += incr_line; - } -} - - -/* compute image first and second order spatio-temporal derivatives of a color image */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void get_derivatives(const image_t *im1, const image_t *im2, const convolution_t *deriv, - image_t *dx, image_t *dy, image_t *dt, - image_t *dxx, image_t *dxy, image_t *dyy, image_t *dxt, image_t *dyt) -#else -void get_derivatives(const color_image_t *im1, const color_image_t *im2, const convolution_t *deriv, - color_image_t *dx, color_image_t *dy, color_image_t *dt, - color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt) -#endif -{ - // derivatives are computed on the mean of the first image and the warped second image -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) - 
image_t *tmp_im2 = image_new(im2->width,im2->height); - v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; -#if (VECTOR_WIDTH == 4) - const v4sf half = {0.5f,0.5f,0.5f,0.5f}; -#else - const v4sf half = 0.5f; -#endif - int i=0; - for(i=0 ; iheight*im1->stride/VECTOR_WIDTH ; i++){ - *tmp_im2p = half * ( (*im2p) + (*im1p) ); - *dtp = (*im2p)-(*im1p); - dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; - } - // compute all other derivatives - image_convolve_hv(dx, tmp_im2, deriv, NULL); - image_convolve_hv(dy, tmp_im2, NULL, deriv); - image_convolve_hv(dxx, dx, deriv, NULL); - image_convolve_hv(dxy, dx, NULL, deriv); - image_convolve_hv(dyy, dy, NULL, deriv); - image_convolve_hv(dxt, dt, deriv, NULL); - image_convolve_hv(dyt, dt, NULL, deriv); - // free memory - image_delete(tmp_im2); -#else - color_image_t *tmp_im2 = color_image_new(im2->width,im2->height); - v4sf *tmp_im2p = (v4sf*) tmp_im2->c1, *dtp = (v4sf*) dt->c1, *im1p = (v4sf*) im1->c1, *im2p = (v4sf*) im2->c1; -#if (VECTOR_WIDTH == 4) - const v4sf half = {0.5f,0.5f,0.5f,0.5f}; -#else - const v4sf half = 0.5f; -#endif - int i=0; - for(i=0 ; i<3*im1->height*im1->stride/VECTOR_WIDTH; i++){ - *tmp_im2p = half * ( (*im2p) + (*im1p) ); - *dtp = (*im2p)-(*im1p); - dtp+=1; im1p+=1; im2p+=1; tmp_im2p+=1; - } - // compute all other derivatives - color_image_convolve_hv(dx, tmp_im2, deriv, NULL); - color_image_convolve_hv(dy, tmp_im2, NULL, deriv); - color_image_convolve_hv(dxx, dx, deriv, NULL); - color_image_convolve_hv(dxy, dx, NULL, deriv); - color_image_convolve_hv(dyy, dy, NULL, deriv); - color_image_convolve_hv(dxt, dt, deriv, NULL); - color_image_convolve_hv(dyt, dt, NULL, deriv); - // free memory - color_image_delete(tmp_im2); -#endif -} - - -/* compute the smoothness term */ -/* It is represented as two images, the first one for horizontal smoothness, the second for vertical - in dst_horiz, the pixel i,j represents the smoothness weight between pixel i,j and i,j+1 - in dst_vert, the pixel i,j represents the smoothness weight between pixel i,j and i+1,j */ -void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, const convolution_t *deriv_flow, const float quarter_alpha){ - const int width = uu->width, height = vv->height, stride = uu->stride; - int j; - image_t *ux = image_new(width,height), *vx = image_new(width,height), *uy = image_new(width,height), *vy = image_new(width,height), *smoothness = image_new(width,height); - // compute derivatives [-0.5 0 0.5] - convolve_horiz(ux, uu, deriv_flow); - convolve_horiz(vx, vv, deriv_flow); - convolve_vert(uy, uu, deriv_flow); - convolve_vert(vy, vv, deriv_flow); - // compute smoothness - v4sf *uxp = (v4sf*) ux->c1, *vxp = (v4sf*) vx->c1, *uyp = (v4sf*) uy->c1, *vyp = (v4sf*) vy->c1, *sp = (v4sf*) smoothness->c1; -#if (VECTOR_WIDTH == 4) - const v4sf qa = {quarter_alpha,quarter_alpha,quarter_alpha,quarter_alpha}; - const v4sf epsmooth = {epsilon_smooth,epsilon_smooth,epsilon_smooth,epsilon_smooth}; -#else - const v4sf qa = quarter_alpha; - const v4sf epsmooth = epsilon_smooth; -#endif - for(j=0 ; j< height*stride/VECTOR_WIDTH ; j++){ -#if (VECTOR_WIDTH == 4) - *sp = qa / vsqrtq_f32( - (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); -#else - *sp = qa / sqrtf( - (*uxp)*(*uxp) + (*uyp)*(*uyp) + (*vxp)*(*vxp) + (*vyp)*(*vyp) + epsmooth ); -#endif - sp+=1;uxp+=1; uyp+=1; vxp+=1; vyp+=1; - } - image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); - // compute 
dst_horiz - v4sf *dsthp = (v4sf*) dst_horiz->c1; sp = (v4sf*) smoothness->c1; - float *sp_shift = (float*) memalign(16, stride*sizeof(float)); // aligned shifted copy of the current line - for(j=0;jc1[j*stride+width-1], 0, sizeof(float)*(stride-width+1)); - } - free(sp_shift); - // compute dst_vert - v4sf *dstvp = (v4sf*) dst_vert->c1, *sp_bottom = (v4sf*) (smoothness->c1+stride); sp = (v4sf*) smoothness->c1; - for(j=0 ; j<(height-1)*stride/VECTOR_WIDTH ; j++){ - *dstvp = (*sp) + (*sp_bottom); - dstvp+=1; sp+=1; sp_bottom+=1; - } - memset( &dst_vert->c1[(height-1)*stride], 0, sizeof(float)*stride); - image_delete(smoothness); -} - - - - - -/* sub the laplacian (smoothness term) to the right-hand term */ -void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert){ - int j; - const int offsetline = src->stride-src->width; - float *src_ptr = src->c1, *dst_ptr = dst->c1, *weight_horiz_ptr = weight_horiz->c1; - // horizontal filtering - for(j=src->height+1;--j;){ // faster than for(j=0;jheight;j++) - int i; - for(i=src->width;--i;){ - const float tmp = (*weight_horiz_ptr)*((*(src_ptr+1))-(*src_ptr)); - *dst_ptr += tmp; - *(dst_ptr+1) -= tmp; - dst_ptr++; - src_ptr++; - weight_horiz_ptr++; - } - dst_ptr += offsetline+1; - src_ptr += offsetline+1; - weight_horiz_ptr += offsetline+1; - } - - v4sf *wvp = (v4sf*) weight_vert->c1, *srcp = (v4sf*) src->c1, *srcp_s = (v4sf*) (src->c1+src->stride), *dstp = (v4sf*) dst->c1, *dstp_s = (v4sf*) (dst->c1+src->stride); - for(j=1+(src->height-1)*src->stride/VECTOR_WIDTH ; --j ;){ - const v4sf tmp = (*wvp) * ((*srcp_s)-(*srcp)); - *dstp += tmp; - *dstp_s -= tmp; - wvp+=1; srcp+=1; srcp_s+=1; dstp+=1; dstp_s+=1; - } -} - -/* compute the dataterm // REMOVED MATCHING TERM - a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side - other (color) images are input */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#else -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) -#endif -{ -#if (VECTOR_WIDTH == 4) - const v4sf dnorm = {datanorm, datanorm, datanorm, datanorm}; - const v4sf hdover3 = {half_delta_over3, half_delta_over3, half_delta_over3, half_delta_over3}; - const v4sf epscolor = {epsilon_color, epsilon_color, epsilon_color, epsilon_color}; - const v4sf hgover3 = {half_gamma_over3, half_gamma_over3, half_gamma_over3, half_gamma_over3}; - const v4sf epsgrad = {epsilon_grad, epsilon_grad, epsilon_grad, epsilon_grad}; - //const v4sf hbeta = {half_beta,half_beta,half_beta,half_beta}; - //const v4sf epsdesc = {epsilon_desc,epsilon_desc,epsilon_desc,epsilon_desc}; -#else - const v4sf dnorm = datanorm; - const v4sf hdover3 = half_delta_over3; - const v4sf epscolor = epsilon_color; - const v4sf hgover3 = half_gamma_over3; - const v4sf epsgrad 
= epsilon_grad; -#endif - - v4sf *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1, - *maskp = (v4sf*) mask->c1, - *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, - *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, - *ix1p=(v4sf*)Ix->c1, *iy1p=(v4sf*)Iy->c1, *iz1p=(v4sf*)Iz->c1, *ixx1p=(v4sf*)Ixx->c1, *ixy1p=(v4sf*)Ixy->c1, *iyy1p=(v4sf*)Iyy->c1, *ixz1p=(v4sf*)Ixz->c1, *iyz1p=(v4sf*) Iyz->c1, -#if (SELECTCHANNEL==3) - *ix2p=(v4sf*)Ix->c2, *iy2p=(v4sf*)Iy->c2, *iz2p=(v4sf*)Iz->c2, *ixx2p=(v4sf*)Ixx->c2, *ixy2p=(v4sf*)Ixy->c2, *iyy2p=(v4sf*)Iyy->c2, *ixz2p=(v4sf*)Ixz->c2, *iyz2p=(v4sf*) Iyz->c2, - *ix3p=(v4sf*)Ix->c3, *iy3p=(v4sf*)Iy->c3, *iz3p=(v4sf*)Iz->c3, *ixx3p=(v4sf*)Ixx->c3, *ixy3p=(v4sf*)Ixy->c3, *iyy3p=(v4sf*)Iyy->c3, *ixz3p=(v4sf*)Ixz->c3, *iyz3p=(v4sf*) Iyz->c3, -#endif - *uup = (v4sf*) uu->c1, *vvp = (v4sf*)vv->c1, *wxp = (v4sf*)wx->c1, *wyp = (v4sf*)wy->c1; - - - memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride); - memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); - memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); - - int i; - for(i = 0 ; iheight*uu->stride/VECTOR_WIDTH ; i++){ - v4sf tmp, tmp2, n1, n2; -#if (SELECTCHANNEL==3) - v4sf tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; -#endif - // dpsi color - if(half_delta_over3){ - tmp = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp); - n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; -#if (SELECTCHANNEL==3) - tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp); - n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; - tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); - n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hdover3 / vsqrtq_f32(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); -#else - tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); -#endif - tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; -#else -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hdover3 / vsqrtq_f32(3 * tmp*tmp/n1 + epscolor); -#else - tmp = (*maskp) * hdover3 / sqrtf(3 * tmp*tmp/n1 + epscolor); -#endif - tmp /= n1; -#endif - *a11p += tmp * (*ix1p) * (*ix1p); - *a12p += tmp * (*ix1p) * (*iy1p); - *a22p += tmp * (*iy1p) * (*iy1p); - *b1p -= tmp * (*iz1p) * (*ix1p); - *b2p -= tmp * (*iz1p) * (*iy1p); -#if (SELECTCHANNEL==3) - *a11p += tmp2 * (*ix2p) * (*ix2p); - *a12p += tmp2 * (*ix2p) * (*iy2p); - *a22p += tmp2 * (*iy2p) * (*iy2p); - *b1p -= tmp2 * (*iz2p) * (*ix2p); - *b2p -= tmp2 * (*iz2p) * (*iy2p); - *a11p += tmp3 * (*ix3p) * (*ix3p); - *a12p += tmp3 * (*ix3p) * (*iy3p); - *a22p += tmp3 * (*iy3p) * (*iy3p); - *b1p -= tmp3 * (*iz3p) * (*ix3p); - *b2p -= tmp3 * (*iz3p) * (*iy3p); -#endif - } - - // dpsi gradient - n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; - n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; - tmp = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp); - tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp); -#if (SELECTCHANNEL==3) - n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; - n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; - tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp); - tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp); - n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; - n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; - tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); - tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); -#if (VECTOR_WIDTH == 4) - tmp = 
(*maskp) * hgover3 / vsqrtq_f32( - tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); -#else - tmp = (*maskp) * hgover3 / sqrtf( - tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); -#endif - tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; -#else -#if (VECTOR_WIDTH == 4) - tmp = (*maskp) * hgover3 / vsqrtq_f32(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); -#else - tmp = (*maskp) * hgover3 / sqrtf(3* tmp*tmp/n1 + 3* tmp2*tmp2/n2 + epsgrad); -#endif - tmp2 = tmp/n2; tmp /= n1; -#endif - *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); - *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p); - *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p); - *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); - *b2p -= tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p); -#if (SELECTCHANNEL==3) - *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); - *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p); - *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p); - *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); - *b2p -= tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p); - *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); - *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p); - *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p); - *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); - *b2p -= tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p); -#endif - - -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // multiply system to make smoothing parameters same for RGB and single-channel image - *a11p *= 3; - *a12p *= 3; - *a22p *= 3; - *b1p *= 3; - *b2p *= 3; - -#endif - - dup+=1; dvp+=1; maskp+=1; a11p+=1; a12p+=1; a22p+=1; b1p+=1; b2p+=1; - ix1p+=1; iy1p+=1; iz1p+=1; ixx1p+=1; ixy1p+=1; iyy1p+=1; ixz1p+=1; iyz1p+=1; -#if (SELECTCHANNEL==3) - ix2p+=1; iy2p+=1; iz2p+=1; ixx2p+=1; ixy2p+=1; iyy2p+=1; ixz2p+=1; iyz2p+=1; - ix3p+=1; iy3p+=1; iz3p+=1; ixx3p+=1; ixy3p+=1; iyy3p+=1; ixz3p+=1; iyz3p+=1; -#endif - uup+=1;vvp+=1;wxp+=1; wyp+=1; - - } -} - diff --git a/src/FDF1.0.1/opticalflow_aux.cpp b/src/FDF1.0.1/opticalflow_aux.cpp new file mode 100644 index 0000000..bc7f956 --- /dev/null +++ b/src/FDF1.0.1/opticalflow_aux.cpp @@ -0,0 +1,115 @@ +#include +#include +#include +#include + +#include "opticalflow_aux.h" +#include "../kernels/flowUtil.h" +#include "../common/timer.h" + +#include + +using namespace timer; + +#if (VECTOR_WIDTH == 4) +typedef float32x4_t v4sf; +#else +typedef float v4sf; +#endif + +#define datanorm 0.1f*0.1f//0.01f // square of the normalization factor +#define epsilon_color (0.001f*0.001f)//0.000001f +#define epsilon_grad (0.001f*0.001f)//0.000001f +#define epsilon_desc (0.001f*0.001f)//0.000001f +#define epsilon_smooth (0.001f*0.001f)//0.000001f + +/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. 
dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ +void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) +{ + + cu::warpImage(dst, mask, src, wx, wy); + +} + + +/* compute image first and second order spatio-temporal derivatives of a color image */ +void get_derivatives( + const color_image_t *im1, const color_image_t *im2, float *pDeviceKernel, + color_image_t *dx, color_image_t *dy, color_image_t *dt, + color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt) +{ + // derivatives are computed on the mean of the first image and the warped second image + color_image_t *tmp_im2 = color_image_new(im2->width,im2->height); + + int height = im2->height; + int width = im2->width; + int stride = im2->stride; + + cu::getMeanImageAndDiff(im1->c1, im2->c1, tmp_im2->c1, dt->c1, im1->height, im1->stride); + + // compute all other derivatives + cu::colorImageDerivative(dx->c1, tmp_im2->c1, pDeviceKernel, height, width, stride, true); // horizontal + cu::colorImageDerivative(dy->c1, tmp_im2->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dxx->c1, dx->c1, pDeviceKernel, height, width, stride, true); + cu::colorImageDerivative(dxy->c1, dx->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dyy->c1, dy->c1, pDeviceKernel, height, width, stride, false); + cu::colorImageDerivative(dxt->c1, dt->c1, pDeviceKernel, height, width, stride, true); + cu::colorImageDerivative(dyt->c1, dt->c1, pDeviceKernel, height, width, stride, false); + + // free memory + color_image_delete(tmp_im2); +} + + +/* compute the smoothness term */ +/* It is represented as two images, the first one for horizontal smoothness, the second for vertical + in dst_horiz, the pixel i,j represents the smoothness weight between pixel i,j and i,j+1 + in dst_vert, the pixel i,j represents the smoothness weight between pixel i,j and i+1,j */ +void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, float *deriv_flow, const float quarter_alpha){ + const int width = uu->width, height = vv->height, stride = uu->stride; + int j; + image_t *ux = image_new(width,height), *vx = image_new(width,height), *uy = image_new(width,height), *vy = image_new(width,height), *smoothness = image_new(width,height); + + // compute derivatives [-0.5 0 0.5] + cu::imageDerivative(ux->c1, uu->c1, deriv_flow, height, width, stride, true); + cu::imageDerivative(vx->c1, vv->c1, deriv_flow, height, width, stride, true); + cu::imageDerivative(uy->c1, uu->c1, deriv_flow, height, width, stride, false); + cu::imageDerivative(vy->c1, vv->c1, deriv_flow, height, width, stride, false); + + cu::smoothnessTerm( + dst_horiz->c1, dst_vert->c1, smoothness->c1, + ux->c1, uy->c1, vx->c1, vy->c1, + quarter_alpha, epsilon_smooth, + height, width, stride); + + // Cleanup extra columns + for(j=0;jc1[j*stride+width-1], 0, sizeof(float)*(stride-width+1)); + } + // Cleanup last row + memset( &dst_vert->c1[(height-1)*stride], 0, sizeof(float)*stride); + + image_delete(ux); image_delete(uy); image_delete(vx); image_delete(vy); + image_delete(smoothness); +} + + + + + +/* sub the laplacian (smoothness term) to the right-hand term */ +void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert){ + + cu::subLaplacianHoriz(src->c1, dst->c1, weight_horiz->c1, src->height, src->width, src->stride); + + 
cu::subLaplacianVert(src->c1, dst->c1, weight_vert->c1, src->height, src->stride); +} + +/* compute the dataterm // REMOVED MATCHING TERM + a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side + other (color) images are input */ +void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3) +{ + return cu::dataTerm(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, half_delta_over3, half_beta, half_gamma_over3); +} + diff --git a/src/FDF1.0.1/opticalflow_aux.h b/src/FDF1.0.1/opticalflow_aux.h index ebca1f1..f78443d 100644 --- a/src/FDF1.0.1/opticalflow_aux.h +++ b/src/FDF1.0.1/opticalflow_aux.h @@ -9,25 +9,14 @@ extern "C" { #include "image.h" -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image -/* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ -void image_warp(image_t *dst, image_t *mask, const image_t *src, const image_t *wx, const image_t *wy); - -/* compute image first and second order spatio-temporal derivatives of a color image */ -void get_derivatives(const image_t *im1, const image_t *im2, const convolution_t *deriv, image_t *dx, image_t *dy, image_t *dt, image_t *dxx, image_t *dxy, image_t *dyy, image_t *dxt, image_t *dyt); - -#else // use RGB image_new /* warp a color image according to a flow. src is the input image, wx and wy, the input flow. dst is the warped image and mask contains 0 or 1 if the pixels goes outside/inside image boundaries */ void image_warp(color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy); /* compute image first and second order spatio-temporal derivatives of a color image */ -void get_derivatives(const color_image_t *im1, const color_image_t *im2, const convolution_t *deriv, color_image_t *dx, color_image_t *dy, color_image_t *dt, color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt); +void get_derivatives(const color_image_t *im1, const color_image_t *im2, float *deriv, color_image_t *dx, color_image_t *dy, color_image_t *dt, color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt); -#endif - - /* compute the smoothness term */ -void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, const convolution_t *deriv_flow, const float quarter_alpha); +void compute_smoothness(image_t *dst_horiz, image_t *dst_vert, const image_t *uu, const image_t *vv, float *kernel, const float quarter_alpha); // void compute_smoothness_SF(image_t *dst_horiz, image_t *dst_vert, const image_t *xx1, const image_t *xx2, const image_t *yy, const image_t *xx3, const convolution_t *deriv_flow, const float quarter_alpha); /* sub the laplacian (smoothness term) to the right-hand term */ @@ -36,12 +25,7 @@ void sub_laplacian(image_t *dst, const image_t *src, const image_t *weight_horiz /* compute the dataterm and ... 
REMOVED THE MATCHING TERM a11 a12 a22 represents the 2x2 diagonal matrix, b1 and b2 the right hand side other (color) images are input */ -#if (SELECTCHANNEL==1 | SELECTCHANNEL==2) // use single band image_delete -void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, image_t *Ix, image_t *Iy, image_t *Iz, image_t *Ixx, image_t *Ixy, image_t *Iyy, image_t *Ixz, image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#else void compute_data(image_t *a11, image_t *a12, image_t *a22, image_t *b1, image_t *b2, image_t *mask, image_t *wx, image_t *wy, image_t *du, image_t *dv, image_t *uu, image_t *vv, color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); -#endif - #ifdef __cplusplus } diff --git a/src/FDF1.0.1/solver.c b/src/FDF1.0.1/solver.c deleted file mode 100644 index 162b3a0..0000000 --- a/src/FDF1.0.1/solver.c +++ /dev/null @@ -1,426 +0,0 @@ -#include -#include -#include -#include -#include - -#include - -#include "image.h" -#include "solver.h" - -// #include -// #if (VECTOR_WIDTH == 4) -// typedef float32x4_t v4sf; -// #else -// typedef float v4sf; -// #endif - -//THIS IS A SLOW VERSION BUT READABLE -//Perform n iterations of the sor_coupled algorithm -//du and dv are used as initial guesses -//The system form is the same as in opticalflow.c -void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega) -{ - int i,j,iter; - for(iter = 0 ; iterheight ; j++) - { - float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2;//,det; - for(i=0 ; iwidth ; i++) - { - sigma_u = 0.0f; - sigma_v = 0.0f; - sum_dpsis = 0.0f; - if(j>0) - { - sigma_u -= dpsis_vert->c1[(j-1)*du->stride+i]*du->c1[(j-1)*du->stride+i]; - sigma_v -= dpsis_vert->c1[(j-1)*du->stride+i]*dv->c1[(j-1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[(j-1)*du->stride+i]; - } - if(i>0) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i-1]*du->c1[j*du->stride+i-1]; - sigma_v -= dpsis_horiz->c1[j*du->stride+i-1]*dv->c1[j*du->stride+i-1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i-1]; - } - if(jheight-1) - { - sigma_u -= dpsis_vert->c1[j*du->stride+i]*du->c1[(j+1)*du->stride+i]; - sigma_v -= dpsis_vert->c1[j*du->stride+i]*dv->c1[(j+1)*du->stride+i]; - sum_dpsis += dpsis_vert->c1[j*du->stride+i]; - } - if(iwidth-1) - { - sigma_u -= dpsis_horiz->c1[j*du->stride+i]*du->c1[j*du->stride+i+1]; - sigma_v -= dpsis_horiz->c1[j*du->stride+i]*dv->c1[j*du->stride+i+1]; - sum_dpsis += dpsis_horiz->c1[j*du->stride+i]; - } - A11 = a11->c1[j*du->stride+i]+sum_dpsis; - A12 = a12->c1[j*du->stride+i]; - A22 = a22->c1[j*du->stride+i]+sum_dpsis; - //det = A11*A22-A12*A12; - B1 = b1->c1[j*du->stride+i]-sigma_u; - B2 = b2->c1[j*du->stride+i]-sigma_v; -// du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] +omega*( A22*B1-A12*B2)/det; -// dv->c1[j*du->stride+i] = (1.0f-omega)*dv->c1[j*du->stride+i] +omega*(-A12*B1+A11*B2)/det; - du->c1[j*du->stride+i] = (1.0f-omega)*du->c1[j*du->stride+i] + omega/A11 *(B1 - A12* dv->c1[j*du->stride+i] ); - dv->c1[j*du->stride+i] = (1.0f-omega)*dv->c1[j*du->stride+i] + omega/A22 *(B2 - A12* 
du->c1[j*du->stride+i] ); - - - } - } - } -} - - // THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION - // the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal - // each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line -// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ -// //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); -// -// if(du->width<2 || du->height<2 || iterations < 1){ -// sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); -// return; -// } -// -// const int stride = du->stride, width = du->width; -// const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); -// int j,iter,i,k; -// float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); -// if(floatarray==NULL){ -// fprintf(stderr, "error in sor_coupled(): not enough memory\n"); -// exit(1); -// } -// float *f1 = floatarray; -// float *f2 = f1+stride; -// float *f3 = f2+stride; -// f1[0] = 0.0f; -// memset(&f1[width], 0, sizeof(float)*(stride-width)); -// memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); -// memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); -// -// { // first iteration -// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; -// float *du_ptr = du->c1, *dv_ptr = dv->c1; -// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); -// -// { // first iteration - first line -// -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) 
+ (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// v4sf *vpt = (v4sf*) dpsis_vert->c1; -// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; -// -// for(j=iterheight;--j;){ // first iteration - middle lines -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// { // first iteration - last line -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); 
-// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // reverse 2x2 diagonal block -// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); -// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; -// const v4sf det = A11*A22 - (*a12p)*(*a12p); -// *a11p = A11/det; -// *a22p = A22/det; -// *a12p /= -det; -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// } -// -// -// -// for(iter=iterations;--iter;) // other iterations -// { -// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; -// float *du_ptr = du->c1, *dv_ptr = dv->c1; -// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); -// -// { // other iteration - first line -// -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// v4sf *vpt = (v4sf*) dpsis_vert->c1; -// v4sf *dut = 
(v4sf*) du->c1, *dvt = (v4sf*) dv->c1; -// -// for(j=iterheight;--j;) // other iteration - middle lines -// { -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++) -// { -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// for(i=iterline; --i;) -// { -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); -// for(k=0;k<4;k++) -// { -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// -// { // other iteration - last line -// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); -// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); -// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); -// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; -// -// { // left block -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); -// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); -// for(k=1;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// for(i=iterline;--i;){ -// // do one iteration -// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); -// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); -// for(k=0;k<4;k++){ -// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; -// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; -// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); -// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); -// } -// // increment pointer -// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; -// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; -// du_ptr += 4; dv_ptr += 4; -// } -// -// } -// } -// -// -// -// free(floatarray); -// -// } - diff --git 
a/src/kernels/flowUtil.cu b/src/kernels/flowUtil.cu new file mode 100644 index 0000000..fd55be5 --- /dev/null +++ b/src/kernels/flowUtil.cu @@ -0,0 +1,892 @@ +// System +#include +#include +#include +#include + +// CUDA +#include +#include + +// NVIDIA Perf Primitives +#include +#include + +#include "../common/timer.h" +#include "../FDF1.0.1/image.h" +#include "flowUtil.h" + +using namespace timer; + +#define datanorm 0.1f*0.1f //0.01f // square of the normalization factor +#define epsilon_color (0.001f*0.001f) //0.000001f +#define epsilon_grad (0.001f*0.001f) //0.000001f +#define epsilon_desc (0.001f*0.001f) //0.000001f +#define epsilon_smooth (0.001f*0.001f) //0.000001f + +__global__ void kernelDataTerm( + float *a11c1, float *a12c1, float *a22c1, + float *b1c1, float *b2c1, + float *maskc1, + float *wxc1, float *wyc1, + float *duc1, float *dvc1, + float *uuc1, float *vvc1, + float *Ixc1, float *Ixc2, float *Ixc3, + float *Iyc1, float *Iyc2, float *Iyc3, + float *Izc1, float *Izc2, float *Izc3, + float *Ixxc1, float *Ixxc2, float *Ixxc3, + float *Ixyc1, float *Ixyc2, float *Ixyc3, + float *Iyyc1, float *Iyyc2, float *Iyyc3, + float *Ixzc1, float *Ixzc2, float *Ixzc3, + float *Iyzc1, float *Iyzc2, float *Iyzc3, + const float half_delta_over3, const float half_beta, const float half_gamma_over3, int N) { + + int tidx = blockDim.x * blockIdx.x + threadIdx.x; + + if (tidx < N) { + + const float dnorm = datanorm; + const float hdover3 = half_delta_over3; + const float epscolor = epsilon_color; + const float hgover3 = half_gamma_over3; + const float epsgrad = epsilon_grad; + + float *dup = (float*) duc1 + tidx, + *dvp = (float*) dvc1 + tidx, + *maskp = (float*) maskc1 + tidx, + *a11p = (float*) a11c1 + tidx, + *a12p = (float*) a12c1 + tidx, + *a22p = (float*) a22c1 + tidx, + *b1p = (float*) b1c1 + tidx, + *b2p = (float*) b2c1 + tidx, + *ix1p = (float*) Ixc1 + tidx, + *iy1p=(float*)Iyc1 + tidx, + *iz1p=(float*)Izc1 + tidx, + *ixx1p=(float*)Ixxc1 + tidx, + *ixy1p=(float*)Ixyc1 + tidx, + *iyy1p=(float*)Iyyc1 + tidx, + *ixz1p=(float*)Ixzc1 + tidx, + *iyz1p=(float*) Iyzc1 + tidx, + *ix2p = (float*) Ixc2 + tidx, + *iy2p=(float*)Iyc2 + tidx, + *iz2p=(float*)Izc2 + tidx, + *ixx2p=(float*)Ixxc2 + tidx, + *ixy2p=(float*)Ixyc2 + tidx, + *iyy2p=(float*)Iyyc2 + tidx, + *ixz2p=(float*)Ixzc2 + tidx, + *iyz2p=(float*) Iyzc2 + tidx, + *ix3p = (float*) Ixc3 + tidx, + *iy3p=(float*)Iyc3 + tidx, + *iz3p=(float*)Izc3 + tidx, + *ixx3p=(float*)Ixxc3 + tidx, + *ixy3p=(float*)Ixyc3 + tidx, + *iyy3p=(float*)Iyyc3 + tidx, + *ixz3p=(float*)Ixzc3 + tidx, + *iyz3p=(float*) Iyzc3 + tidx; + + + float tmp, tmp2, n1, n2; + float tmp3, tmp4, tmp5, tmp6, n3, n4, n5, n6; + + // dpsi color + if(half_delta_over3){ + tmp = *iz1p + (*ix1p)*(*dup) + (*iy1p)*(*dvp); + n1 = (*ix1p) * (*ix1p) + (*iy1p) * (*iy1p) + dnorm; + tmp2 = *iz2p + (*ix2p)*(*dup) + (*iy2p)*(*dvp); + n2 = (*ix2p) * (*ix2p) + (*iy2p) * (*iy2p) + dnorm; + tmp3 = *iz3p + (*ix3p)*(*dup) + (*iy3p)*(*dvp); + n3 = (*ix3p) * (*ix3p) + (*iy3p) * (*iy3p) + dnorm; + tmp = (*maskp) * hdover3 / sqrtf(tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + epscolor); + tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; + *a11p += tmp * (*ix1p) * (*ix1p); + *a12p += tmp * (*ix1p) * (*iy1p); + *a22p += tmp * (*iy1p) * (*iy1p); + *b1p -= tmp * (*iz1p) * (*ix1p); + *b2p -= tmp * (*iz1p) * (*iy1p); + *a11p += tmp2 * (*ix2p) * (*ix2p); + *a12p += tmp2 * (*ix2p) * (*iy2p); + *a22p += tmp2 * (*iy2p) * (*iy2p); + *b1p -= tmp2 * (*iz2p) * (*ix2p); + *b2p -= tmp2 * (*iz2p) * (*iy2p); + *a11p += tmp3 * (*ix3p) * 
(*ix3p); + *a12p += tmp3 * (*ix3p) * (*iy3p); + *a22p += tmp3 * (*iy3p) * (*iy3p); + *b1p -= tmp3 * (*iz3p) * (*ix3p); + *b2p -= tmp3 * (*iz3p) * (*iy3p); + } + + // dpsi gradient + n1 = (*ixx1p) * (*ixx1p) + (*ixy1p) * (*ixy1p) + dnorm; + n2 = (*iyy1p) * (*iyy1p) + (*ixy1p) * (*ixy1p) + dnorm; + tmp = *ixz1p + (*ixx1p) * (*dup) + (*ixy1p) * (*dvp); + tmp2 = *iyz1p + (*ixy1p) * (*dup) + (*iyy1p) * (*dvp); + n3 = (*ixx2p) * (*ixx2p) + (*ixy2p) * (*ixy2p) + dnorm; + n4 = (*iyy2p) * (*iyy2p) + (*ixy2p) * (*ixy2p) + dnorm; + tmp3 = *ixz2p + (*ixx2p) * (*dup) + (*ixy2p) * (*dvp); + tmp4 = *iyz2p + (*ixy2p) * (*dup) + (*iyy2p) * (*dvp); + n5 = (*ixx3p) * (*ixx3p) + (*ixy3p) * (*ixy3p) + dnorm; + n6 = (*iyy3p) * (*iyy3p) + (*ixy3p) * (*ixy3p) + dnorm; + tmp5 = *ixz3p + (*ixx3p) * (*dup) + (*ixy3p) * (*dvp); + tmp6 = *iyz3p + (*ixy3p) * (*dup) + (*iyy3p) * (*dvp); + tmp = (*maskp) * hgover3 / sqrtf( + tmp*tmp/n1 + tmp2*tmp2/n2 + tmp3*tmp3/n3 + tmp4*tmp4/n4 + tmp5*tmp5/n5 + tmp6*tmp6/n6 + epsgrad); + tmp6 = tmp/n6; tmp5 = tmp/n5; tmp4 = tmp/n4; tmp3 = tmp/n3; tmp2 = tmp/n2; tmp /= n1; + *a11p += tmp *(*ixx1p)*(*ixx1p) + tmp2*(*ixy1p)*(*ixy1p); + *a12p += tmp *(*ixx1p)*(*ixy1p) + tmp2*(*ixy1p)*(*iyy1p); + *a22p += tmp2*(*iyy1p)*(*iyy1p) + tmp *(*ixy1p)*(*ixy1p); + *b1p -= tmp *(*ixx1p)*(*ixz1p) + tmp2*(*ixy1p)*(*iyz1p); + *b2p -= tmp2*(*iyy1p)*(*iyz1p) + tmp *(*ixy1p)*(*ixz1p); + *a11p += tmp3*(*ixx2p)*(*ixx2p) + tmp4*(*ixy2p)*(*ixy2p); + *a12p += tmp3*(*ixx2p)*(*ixy2p) + tmp4*(*ixy2p)*(*iyy2p); + *a22p += tmp4*(*iyy2p)*(*iyy2p) + tmp3*(*ixy2p)*(*ixy2p); + *b1p -= tmp3*(*ixx2p)*(*ixz2p) + tmp4*(*ixy2p)*(*iyz2p); + *b2p -= tmp4*(*iyy2p)*(*iyz2p) + tmp3*(*ixy2p)*(*ixz2p); + *a11p += tmp5*(*ixx3p)*(*ixx3p) + tmp6*(*ixy3p)*(*ixy3p); + *a12p += tmp5*(*ixx3p)*(*ixy3p) + tmp6*(*ixy3p)*(*iyy3p); + *a22p += tmp6*(*iyy3p)*(*iyy3p) + tmp5*(*ixy3p)*(*ixy3p); + *b1p -= tmp5*(*ixx3p)*(*ixz3p) + tmp6*(*ixy3p)*(*iyz3p); + *b2p -= tmp6*(*iyy3p)*(*iyz3p) + tmp5*(*ixy3p)*(*ixz3p); + } + +} + +__global__ void kernelSubLaplacianVert( + float *src, float *nextSrc, + float *dst, float *nextDst, + float *weights, int height, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + + if (tidx < stride) { + float *wvp = weights + tidx, + *srcp = src + tidx, + *srcp_s = nextSrc + tidx, + *dstp = dst + tidx, + *dstp_s = nextDst + tidx; + + for (int j = 0; j < height - 1; j++) { + float tmp = (*wvp) * ((*srcp_s)-(*srcp)); + *dstp += tmp; + *dstp_s -= tmp; + wvp += stride; srcp += stride; srcp_s += stride; dstp += stride; dstp_s += stride; + } + } + +} + +__global__ void kernelSubLaplacianHoriz( + float *src, float *dst, float *weights, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int col = tidx % width; + + const int BLOCK_HEIGHT = 1; + + if (tidx < width) { + float *pSrc = src + tidx, + *pDst = dst + tidx, + *pWeight = weights + tidx, + *pCoeffCalc = coeffs + tidx, + *pCoeffUpdate = pCoeffCalc; + + int nBlocks = (height + BLOCK_HEIGHT - 1) / BLOCK_HEIGHT; + int jCalc = 0; + int jUpdate = 0; + + // Block calculation and update so coeffs fit in cache + + for (int iBlock = 0; iBlock < nBlocks; iBlock++) { + + // Calc coeffs + for (int j = 0; j < BLOCK_HEIGHT && jCalc < height; j++, jCalc++) { + // Do not calculate the last column + if (col != width - 1) + *pCoeffCalc = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + + pSrc += stride; pWeight += stride; pCoeffCalc += stride; + } + + // Update dst + for (int j = 0; j < BLOCK_HEIGHT && jUpdate < height; j++, 
jUpdate++) { + float update = 0.0; + + if (col != 0) + update -= *(pCoeffUpdate - 1); + if (col != width - 1) + update += *pCoeffUpdate; + + *pDst += update; + + pDst += stride; pCoeffUpdate += stride; + } + } + } +} + +__global__ void kernelSubLaplacianHorizFillCoeffs( + float *src, float *weights, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int row = tidx / stride; + int col = tidx % stride; + + // Do not calculate the last column + if (tidx < width && col != width - 1) { + float *pSrc = src + tidx, + *pWeight = weights + tidx, + *pCoeff = coeffs + tidx; + + for (int j = 0; j < height; j++) { + *pCoeff = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + + pSrc += stride; pWeight += stride; pCoeff += stride; + } + } + + // // Do not calculate the last column + // if (col < width - 1) { + // float *pSrc = src + tidx, + // *pWeight = weights + tidx, + // *pCoeff = coeffs + tidx; + + // *pCoeff = (*pWeight) * ( *(pSrc + 1) - *pSrc ); + // } +} + +__global__ void kernelSubLaplacianHorizApplyCoeffs( + float *dst, float *coeffs, int height, int width, int stride) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int row = tidx / stride; + int col = tidx % stride; + + if (tidx < width) { + + float *pDst = dst + tidx, + *pCoeff = coeffs + tidx; + + for (int j = 0; j < height; j++) { + float update = 0.0; + + if (col != 0) + update -= *(pCoeff - 1); + if (col != width - 1) + update += *pCoeff; + + *pDst += update; + + pDst += stride; pCoeff += stride; + } + } + + // if (col < width) { + + // float *pDst = dst + tidx, + // *pCoeff = coeffs + tidx; + + // float update = 0.0; + + // if (col != 0) + // update -= *(pCoeff - 1); + // if (col != width - 1) + // update += *pCoeff; + + // *pDst += update; + // } +} + +__global__ void kernelSorStep( + float *du, float *dv, + float *a11, float *a12, float *a22, + const float *b1, const float *b2, + const float *horiz, const float *vert, + const int iterations, const float omega, + int height, int width, int stride, bool odd) { + + int tidx = blockIdx.x * blockDim.x + threadIdx.x; + int j = tidx / width; + int i = tidx % width; + + bool shouldRun = (odd) + ? 
((i + j) % 2 == 1) + : ((i + j) % 2 == 0); + + if (tidx < width * height && shouldRun) { + + float sigma_u,sigma_v,sum_dpsis,A11,A22,A12,B1,B2; + sigma_u = 0.0f; + sigma_v = 0.0f; + sum_dpsis = 0.0f; + + int here = j * stride + i; + int left = j * stride + i - 1; + int right = j * stride + i + 1; + int up = (j-1) * stride + i; + int down = (j+1) * stride + i; + + if(j>0) + { + sigma_u -= vert[up] * du[up]; + sigma_v -= vert[up] * dv[up]; + sum_dpsis += vert[up]; + } + if(i>0) + { + sigma_u -= horiz[left] * du[left]; + sigma_v -= horiz[left] * dv[left]; + sum_dpsis += horiz[left]; + } + if(j= 0 && xx < width && yy >= 0 && yy < height); + + int x1 = MINMAX_TA(x, width); + int x2 = MINMAX_TA(x + 1, width); + int y1 = MINMAX_TA(y, height); + int y2 = MINMAX_TA(y + 1, height); + + dst1[offset] = + src1[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src1[y1 * stride + x2] * dx * (1.0f-dy) + + src1[y2 * stride + x1] * (1.0f-dx) * dy + + src1[y2 * stride + x2] * dx * dy; + dst2[offset] = + src2[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src2[y1 * stride + x2] * dx * (1.0f-dy) + + src2[y2 * stride + x1] * (1.0f-dx) * dy + + src2[y2 * stride + x2] * dx * dy; + dst3[offset] = + src3[y1 * stride + x1] * (1.0f-dx) * (1.0f-dy) + + src3[y1 * stride + x2] * dx * (1.0f-dy) + + src3[y2 * stride + x1] * (1.0f-dx) * dy + + src3[y2 * stride + x2] * dx * dy; + + } +} + + +namespace cu { + + void dataTerm( + image_t *a11, image_t *a12, image_t *a22, + image_t *b1, image_t *b2, + image_t *mask, + image_t *wx, image_t *wy, + image_t *du, image_t *dv, + image_t *uu, image_t *vv, + color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, + color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, + color_image_t *Ixz, color_image_t *Iyz, + const float half_delta_over3, const float half_beta, const float half_gamma_over3) { + + memset(a11->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(a12->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(a22->c1, 0, sizeof(float)*uu->height*uu->stride); + memset(b1->c1 , 0, sizeof(float)*uu->height*uu->stride); + memset(b2->c1 , 0, sizeof(float)*uu->height*uu->stride); + + // Set up device pointers + float *a11c1, + *a12c1, *a22c1, + *b1c1, *b2c1, + *maskc1, + *wxc1, *wyc1, + *duc1, *dvc1, + *uuc1, *vvc1, + *Ixc1, *Ixc2, *Ixc3, + *Iyc1, *Iyc2, *Iyc3, + *Izc1, *Izc2, *Izc3, + *Ixxc1, *Ixxc2, *Ixxc3, + *Ixyc1, *Ixyc2, *Ixyc3, + *Iyyc1, *Iyyc2, *Iyyc3, + *Ixzc1, *Ixzc2, *Ixzc3, + *Iyzc1, *Iyzc2, *Iyzc3; + + checkCudaErrors( cudaHostGetDevicePointer(&a11c1, a11->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&a12c1, a12->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&a22c1, a22->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&b1c1, b1->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&b2c1, b2->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&maskc1, mask->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&wxc1, wx->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&wyc1, wy->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&duc1, du->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&dvc1, dv->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&uuc1, uu->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&vvc1, vv->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc1, Ix->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc2, Ix->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Ixc3, Ix->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&Iyc1, Iy->c1, 0) ); + checkCudaErrors( 
cudaHostGetDevicePointer(&Iyc2, Iy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyc3, Iy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc1, Iz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc2, Iz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Izc3, Iz->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc1, Ixx->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc2, Ixx->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixxc3, Ixx->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc1, Ixy->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc2, Ixy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixyc3, Ixy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc1, Iyy->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc2, Iyy->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyyc3, Iyy->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc1, Ixz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc2, Ixz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Ixzc3, Ixz->c3, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc1, Iyz->c1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc2, Iyz->c2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&Iyzc3, Iyz->c3, 0) );
+
+    int N = uu->height*uu->stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelDataTerm<<<nBlocks, nThreadsPerBlock>>>(
+        a11c1, a12c1, a22c1,
+        b1c1, b2c1,
+        maskc1,
+        wxc1, wyc1,
+        duc1, dvc1,
+        uuc1, vvc1,
+        Ixc1, Ixc2, Ixc3,
+        Iyc1, Iyc2, Iyc3,
+        Izc1, Izc2, Izc3,
+        Ixxc1, Ixxc2, Ixxc3,
+        Ixyc1, Ixyc2, Ixyc3,
+        Iyyc1, Iyyc2, Iyyc3,
+        Ixzc1, Ixzc2, Ixzc3,
+        Iyzc1, Iyzc2, Iyzc3,
+        half_delta_over3, half_beta, half_gamma_over3, N);
+
+  };
+
+
+  void subLaplacianHoriz(
+      float *src, float *dst, float *weights, int height, int width, int stride) {
+
+    float *pDeviceCoeffs;
+    checkCudaErrors( cudaMalloc((void**) &pDeviceCoeffs, height * stride * sizeof(float)) );
+
+    // Setup device pointers
+    float *pDeviceSrc, *pDeviceDst, *pDeviceWeights;
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceSrc, src, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceDst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&pDeviceWeights, weights, 0) );
+
+    int N = width;
+    // int N = height * stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    auto start_horiz = now();
+
+    kernelSubLaplacianHorizFillCoeffs<<<nBlocks, nThreadsPerBlock>>>(
+        pDeviceSrc, pDeviceWeights, pDeviceCoeffs, height, width, stride);
+
+    kernelSubLaplacianHorizApplyCoeffs<<<nBlocks, nThreadsPerBlock>>>(
+        pDeviceDst, pDeviceCoeffs, height, width, stride);
+
+    // kernelSubLaplacianHoriz<<<nBlocks, nThreadsPerBlock>>>(
+    //     pDeviceSrc, pDeviceDst, pDeviceWeights, pDeviceCoeffs, height, width, stride);
+    cudaDeviceSynchronize();
+    calc_print_elapsed("laplacian horiz", start_horiz);
+
+    cudaFree(pDeviceCoeffs);
+  }
+
+  void subLaplacianVert(
+      float *src, float *dst, float *weights, int height, int stride) {
+
+    float *d_src, *d_dst, *d_weights;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_weights, weights, 0) );
+
+    int N = stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    auto start_vert = now();
+    kernelSubLaplacianVert<<<nBlocks, nThreadsPerBlock>>>(
+        d_src, d_src + stride, d_dst, d_dst + stride, d_weights, height, stride);
+    calc_print_elapsed("laplacian vert", start_vert);
+
+  }
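The cu:: wrappers above never stage the image buffers with cudaMemcpy: they pass host pointers to cudaHostGetDevicePointer and launch kernels on the returned aliases, which assumes those buffers were allocated elsewhere as mapped (zero-copy) pinned host memory. As an aside, and not part of the patch, here is a minimal self-contained sketch of that zero-copy pattern; the kernel, buffer size, and launch configuration are made up for illustration.

// Illustrative sketch only (not part of the patch): zero-copy mapped memory,
// the pattern the cu:: wrappers above assume for their host pointers.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void scaleKernel(float *data, float factor, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

int main() {
  const int n = 1024;
  float *hostBuf, *devAlias;

  // Allocate pinned, mapped host memory (hypothetical stand-in for the image buffers).
  cudaSetDeviceFlags(cudaDeviceMapHost);
  cudaHostAlloc((void**) &hostBuf, n * sizeof(float), cudaHostAllocMapped);
  for (int i = 0; i < n; i++) hostBuf[i] = (float) i;

  // Device-visible alias of the same allocation -- no explicit cudaMemcpy needed.
  cudaHostGetDevicePointer((void**) &devAlias, hostBuf, 0);

  int nThreadsPerBlock = 64;
  int nBlocks = (n + nThreadsPerBlock - 1) / nThreadsPerBlock;
  scaleKernel<<<nBlocks, nThreadsPerBlock>>>(devAlias, 2.0f, n);
  cudaDeviceSynchronize();  // after the sync, the kernel's writes are visible through hostBuf

  printf("hostBuf[10] = %f\n", hostBuf[10]);
  cudaFreeHost(hostBuf);
  return 0;
}

The synchronize before reading on the host mirrors the cudaDeviceSynchronize() calls the wrappers issue before timing or handing results back to the CPU-side solver.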
+
+  void sor(
+      float *du, float *dv,
+      float *a11, float *a12, float *a22,
+      float *b1, float *b2,
+      float *horiz, float *vert,
+      int iterations, float omega,
+      int height, int width, int stride) {
+
+    // Device setup
+    float
+      *d_du,
+      *d_dv,
+      *d_a11,
+      *d_a12,
+      *d_a22,
+      *d_b1,
+      *d_b2,
+      *d_horiz,
+      *d_vert;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_du, du, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dv, dv, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a11, a11, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a12, a12, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_a22, a22, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_b1, b1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_b2, b2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_horiz, horiz, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vert, vert, 0) );
+
+    int N = width * height;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    for(int iter = 0 ; iter < iterations ; iter++) {
+      kernelSorStep<<<nBlocks, nThreadsPerBlock>>>(
+          d_du, d_dv,
+          d_a11, d_a12, d_a22,
+          d_b1, d_b2,
+          d_horiz, d_vert,
+          iterations, omega,
+          height, width, stride, true);
+
+      cudaDeviceSynchronize();
+
+      kernelSorStep<<<nBlocks, nThreadsPerBlock>>>(
+          d_du, d_dv,
+          d_a11, d_a12, d_a22,
+          d_b1, d_b2,
+          d_horiz, d_vert,
+          iterations, omega,
+          height, width, stride, false);
+    }
+  }
+
+  void getMeanImageAndDiff(
+      float *img1, float *img2, float *avgImg, float *diff,
+      int height, int stride) {
+
+    float
+      *d_img1,
+      *d_img2,
+      *d_avgImg,
+      *d_diff;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_img1, img1, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_img2, img2, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_avgImg, avgImg, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_diff, diff, 0) );
+
+    int N = 3 * stride;
+    int nThreadsPerBlock = 64;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelGetMeanImageAndDiff<<<nBlocks, nThreadsPerBlock>>>(
+        d_img1, d_img2, d_avgImg, d_diff,
+        height, stride);
+
+  }
+
+  void colorImageDerivative(
+      float *dst, float *src, float *pDeviceColorDerivativeKernel,
+      int height, int width, int stride, bool horiz) {
+
+    float *d_dst, *d_src;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+
+    Npp32f *pDeviceSrc = d_src;
+    Npp32f *pDeviceDst = d_dst;
+
+    size_t elemSize = sizeof(float);
+    unsigned int nSrcStep = stride * elemSize;
+    unsigned int nDstStep = nSrcStep;
+
+    NppiSize oSrcSize = { width, height };
+    NppiPoint oSrcOffset = { 0, 0 };
+    NppiSize oSizeROI = { width, height };
+    NppiBorderType eBorderType = NPP_BORDER_REPLICATE;
+
+    NPP_CHECK_NPP(
+      (horiz)
+      ? nppiFilterRowBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceColorDerivativeKernel, 5, 2, eBorderType)
+      : nppiFilterColumnBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceColorDerivativeKernel, 5, 2, eBorderType)
+    );
+  }
+
+  // Expects filter kernel of the form
+  // { -0.5, 0.0, 0.5 }
+  void imageDerivative(
+      float *dst, float *src, float *pDeviceDerivativeKernel,
+      int height, int width, int stride, bool horiz) {
+
+    float *d_dst, *d_src;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst, dst, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_src, src, 0) );
+
+    Npp32f *pDeviceSrc = d_src;
+    Npp32f *pDeviceDst = d_dst;
+
+    size_t elemSize = sizeof(float);
+    unsigned int nSrcStep = stride * elemSize;
+    unsigned int nDstStep = nSrcStep;
+
+    NppiSize oSrcSize = { width, height };
+    NppiPoint oSrcOffset = { 0, 0 };
+    NppiSize oSizeROI = { width, height };
+    NppiBorderType eBorderType = NPP_BORDER_REPLICATE;
+
+    NPP_CHECK_NPP(
+      (horiz)
+      ? nppiFilterRowBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceDerivativeKernel, 3, 1, eBorderType)
+      : nppiFilterColumnBorder_32f_C1R (
+          pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
+          pDeviceDst, nDstStep, oSizeROI,
+          pDeviceDerivativeKernel, 3, 1, eBorderType)
+    );
+  }
+
+  void smoothnessTerm(
+      float *dst_horiz, float *dst_vert, float *smoothness,
+      float *ux, float *uy, float *vx, float *vy,
+      float qa, float epsmooth,
+      int height, int width, int stride) {
+
+    float *d_dst_horiz, *d_dst_vert, *d_smoothness, *d_ux, *d_uy, *d_vx, *d_vy;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst_horiz, dst_horiz, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dst_vert, dst_vert, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_smoothness, smoothness, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_ux, ux, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_uy, uy, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vx, vx, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vy, vy, 0) );
+
+    int N = height * stride;
+    int nThreadsPerBlock = 128;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelFlowMag<<<nBlocks, nThreadsPerBlock>>> (
+        d_smoothness, d_ux, d_uy, d_vx, d_vy,
+        qa, epsmooth, height, width, stride);
+
+    cudaDeviceSynchronize();
+
+    kernelSmoothnessHorizVert<<< nBlocks, nThreadsPerBlock >>> (
+        d_dst_horiz, d_dst_vert, d_smoothness, height, width, stride);
+
+    cudaDeviceSynchronize();
+  }
+
+
+  void flowUpdate(
+      float *uu, float *vv, float *wx, float *wy, float *du, float *dv,
+      int height, int width, int stride) {
+
+    float *d_uu,
+      *d_vv,
+      *d_wx,
+      *d_wy,
+      *d_du,
+      *d_dv;
+
+    checkCudaErrors( cudaHostGetDevicePointer(&d_uu, uu, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_vv, vv, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_wx, wx, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_wy, wy, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_du, du, 0) );
+    checkCudaErrors( cudaHostGetDevicePointer(&d_dv, dv, 0) );
+
+    int N = height * stride;
+    int nThreadsPerBlock = 128;
+    int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
+
+    kernelFlowUpdate<<< nBlocks, nThreadsPerBlock >>> (
+        d_uu, d_vv, d_wx, d_wy, d_du, d_dv,
+        height, width, stride);
+
+  }
+
+  /*
+    Warp an image `src` into `dst` using warp vectors `wx`, `wy`.
+    Store `mask[i]` = 0 or 1 if pixel i goes outside or inside image bounds.
+ */ + void warpImage( + color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy) { + + float *d_dst1, *d_dst2, *d_dst3; + float *d_src1, *d_src2, *d_src3; + float *d_mask, *d_wx, *d_wy; + + checkCudaErrors( cudaHostGetDevicePointer(&d_dst1, dst->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_dst2, dst->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_dst3, dst->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src1, src->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src2, src->c2, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_src3, src->c3, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_mask, mask->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_wx, wx->c1, 0) ); + checkCudaErrors( cudaHostGetDevicePointer(&d_wy, wy->c1, 0) ); + + int N = src->height * src->stride; + int nThreadsPerBlock = 64; + int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock; + + kernelWarpImage<<< nBlocks, nThreadsPerBlock >>> ( + d_dst1, d_dst2, d_dst3, d_mask, + d_src1, d_src2, d_src3, + d_wx, d_wy, src->height, src->width, src->stride); + } + +} diff --git a/src/kernels/flowUtil.h b/src/kernels/flowUtil.h new file mode 100644 index 0000000..1781b03 --- /dev/null +++ b/src/kernels/flowUtil.h @@ -0,0 +1,77 @@ +#ifndef __KERNEL_FLOW_UTIL_H__ +#define __KERNEL_FLOW_UTIL_H__ + +// System +#include +#include +#include +#include + +// CUDA +#include +#include + +// Local +#include "../FDF1.0.1/image.h" +#include "../common/Exceptions.h" +#include "../common/timer.h" +#include "../sandbox/process.h" +#include "../patch.h" + +using namespace OFC; + +namespace cu { + + void dataTerm( + image_t *a11, image_t *a12, image_t *a22, + image_t *b1, image_t *b2, + image_t *mask, + image_t *wx, image_t *wy, + image_t *du, image_t *dv, + image_t *uu, image_t *vv, + color_image_t *Ix, color_image_t *Iy, color_image_t *Iz, + color_image_t *Ixx, color_image_t *Ixy, color_image_t *Iyy, + color_image_t *Ixz, color_image_t *Iyz, + const float half_delta_over3, const float half_beta, const float half_gamma_over3); + + void subLaplacianHoriz( + float *src, float *dst, float *weights, int height, int width, int stride); + + void subLaplacianVert( + float *src, float *dst, float *weights, int height, int stride); + + void sor( + float *du, float *dv, + float *a11, float *a12, float *a22, + float *b1, float *b2, + float *horiz, float *vert, + int iterations, float omega, + int height, int width, int stride); + + void getMeanImageAndDiff( + float *img1, float *img2, float *avgImg, float *diff, + int height, int stride); + + void colorImageDerivative( + float *dst, float *src, float *pDeviceKernel, int height, int width, int stride, bool horiz); + + void imageDerivative( + float *dst, float *src, float *pDeviceKernel, int height, int width, int stride, bool horiz); + + void smoothnessTerm( + float *dst_horiz, float *dst_vert, float *smoothness, + float *ux, float *uy, float *vx, float *vy, + float quarter_alpha, float epsilon_smooth, + int height, int width, int stride); + + void flowUpdate( + float *uu, float *vv, float *wx, float *wy, float *du, float *dv, + int height, int width, int stride); + + void warpImage( + color_image_t *dst, image_t *mask, const color_image_t *src, const image_t *wx, const image_t *wy); + +} + +#endif // end __KERNEL_FLOW_UTIL_H__ + diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp index 6bf988d..41d7b46 100644 --- a/src/refine_variational.cpp +++ b/src/refine_variational.cpp @@ -5,6 +5,12 @@ 
#include +// CUDA +#include +#include +#include "common/cuda_helper.h" +#include "kernels/flowUtil.h" + #include #include #include @@ -46,6 +52,18 @@ namespace OFC { float deriv_filter_flow[2] = {0.0f, -0.5f}; deriv_flow = convolution_new(1, deriv_filter_flow, 0); + float pHostColorDerivativeKernel[5] = { 1.0 / 12.0, 2.0 / 3.0, 0.0, -2.0 / 3.0, 1.0 / 12.0 }; + checkCudaErrors( cudaMalloc((void**) &pDeviceColorDerivativeKernel, 5 * sizeof(float)) ); + checkCudaErrors( + cudaMemcpy(pDeviceColorDerivativeKernel, pHostColorDerivativeKernel, + 5 * sizeof(float), cudaMemcpyHostToDevice) ); + + float pHostDerivativeKernel[3] = { 0.5, 0.0, -0.5 }; + checkCudaErrors( cudaMalloc((void**) &pDeviceDerivativeKernel, 3 * sizeof(float)) ); + checkCudaErrors( + cudaMemcpy(pDeviceDerivativeKernel, pHostDerivativeKernel, + 3 * sizeof(float), cudaMemcpyHostToDevice) ); + // copy flow initialization into FV structs static int noparam = 2; // Optical flow @@ -74,12 +92,12 @@ namespace OFC { copyimage(_I0, I0); copyimage(_I1, I1); - // calc_print_elapsed("refine: flow_sep", start_flow_sep); + calc_print_elapsed("refine: flow_sep", start_flow_sep); // Call solver auto start_solver = now(); RefLevelOF(flow_sep[0], flow_sep[1], I0, I1); - // calc_print_elapsed("RefLevelOF [total]", start_solver); + calc_print_elapsed("RefLevelOF [total]", start_solver); // Copy flow result back auto start_copy = now(); @@ -93,7 +111,7 @@ namespace OFC { } } - // calc_print_elapsed("refine: copy back", start_copy); + calc_print_elapsed("refine: copy back", start_copy); // free FV structs for (int i = 0; i < noparam; ++i ) @@ -152,23 +170,24 @@ namespace OFC { *Ix = color_image_new(width,height), *Iy = color_image_new(width,height), *Iz = color_image_new(width,height), // first order derivatives *Ixx = color_image_new(width,height), *Ixy = color_image_new(width,height), *Iyy = color_image_new(width,height), *Ixz = color_image_new(width,height), *Iyz = color_image_new(width,height); // second order derivatives - // calc_print_elapsed("RefLevelOF setup", start_setup); + calc_print_elapsed("RefLevelOF setup", start_setup); // warp second image auto start_image_warp = now(); - image_warp(w_im2, mask, im2, wx, wy); - // calc_print_elapsed("RefLevelOF image_warp", start_image_warp); + // image_warp(w_im2, mask, im2, wx, wy); + cu::warpImage(w_im2, mask, im2, wx, wy); + calc_print_elapsed("RefLevelOF image_warp", start_image_warp); // compute derivatives auto start_get_derivs = now(); - get_derivatives(im1, w_im2, deriv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); - // calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); + get_derivatives(im1, w_im2, pDeviceColorDerivativeKernel, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz); + calc_print_elapsed("RefLevelOF get_derivatives", start_get_derivs); // erase du and dv auto start_image_erase = now(); image_erase(du); image_erase(dv); - // calc_print_elapsed("RefLevelOF image_erase", start_image_erase); + calc_print_elapsed("RefLevelOF image_erase", start_image_erase); // initialize uu and vv memcpy(uu->c1,wx->c1,wx->stride*wx->height*sizeof(float)); @@ -180,46 +199,43 @@ namespace OFC { // compute robust function and system auto start_smooth = now(); - compute_smoothness(smooth_horiz, smooth_vert, uu, vv, deriv_flow, vr.tmp_quarter_alpha ); - // calc_print_elapsed(("RefLevelOF " + iterStr + " smoothness").c_str(), start_smooth); + compute_smoothness(smooth_horiz, smooth_vert, uu, vv, pDeviceDerivativeKernel, vr.tmp_quarter_alpha ); + calc_print_elapsed(("RefLevelOF " + iterStr + " 
smoothness").c_str(), start_smooth); auto start_data = now(); - compute_data(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); - // calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); + // compute_data(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); + cu::dataTerm(a11, a12, a22, b1, b2, mask, wx, wy, du, dv, uu, vv, Ix, Iy, Iz, Ixx, Ixy, Iyy, Ixz, Iyz, vr.tmp_half_delta_over3, vr.tmp_half_beta, vr.tmp_half_gamma_over3); + calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); auto start_lapalcian = now(); sub_laplacian(b1, wx, smooth_horiz, smooth_vert); sub_laplacian(b2, wy, smooth_horiz, smooth_vert); - // calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); + calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); // solve system // #ifdef WITH_OPENMP auto start_sor = now(); sor_coupled_slow_but_readable(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // slower but parallelized - // calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); + calc_print_elapsed(("RefLevelOF " + iterStr + " sor").c_str(), start_sor); // #else // sor_coupled(du, dv, a11, a12, a22, b1, b2, smooth_horiz, smooth_vert, vr.solve_iter, vr.sor_omega); // #endif // update flow plus flow increment auto start_flow_update = now(); - int i; - v4sf *uup = (v4sf*) uu->c1, *vvp = (v4sf*) vv->c1, *wxp = (v4sf*) wx->c1, *wyp = (v4sf*) wy->c1, *dup = (v4sf*) du->c1, *dvp = (v4sf*) dv->c1; - for( i=0 ; ic1, vv->c1, wx->c1, wy->c1, du->c1, dv->c1, + height, width, stride); + calc_print_elapsed(("RefLevelOF " + iterStr + " flow update").c_str(), start_flow_update); - // calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); + calc_print_elapsed(("RefLevelOF " + iterStr + " [total]").c_str(), start_iteration); } // add flow increment to current flow auto start_increment_flow = now(); memcpy(wx->c1,uu->c1,uu->stride*uu->height*sizeof(float)); memcpy(wy->c1,vv->c1,vv->stride*vv->height*sizeof(float)); - // calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); + calc_print_elapsed("RefLevelOF increment flow", start_increment_flow); // free memory auto start_cleanup = now(); @@ -233,10 +249,13 @@ namespace OFC { color_image_delete(w_im2); color_image_delete(Ix); color_image_delete(Iy); color_image_delete(Iz); color_image_delete(Ixx); color_image_delete(Ixy); color_image_delete(Iyy); color_image_delete(Ixz); color_image_delete(Iyz); - // calc_print_elapsed("RefLevelOF cleanup", start_cleanup); + calc_print_elapsed("RefLevelOF cleanup", start_cleanup); } - VarRefClass::~VarRefClass() { } + VarRefClass::~VarRefClass() { + cudaFree(pDeviceColorDerivativeKernel); + cudaFree(pDeviceDerivativeKernel); + } } diff --git a/src/refine_variational.h b/src/refine_variational.h index 9f98d99..8495e55 100644 --- a/src/refine_variational.h +++ b/src/refine_variational.h @@ -52,6 +52,9 @@ namespace OFC { const img_params* i_params; const opt_params* op; + float *pDeviceColorDerivativeKernel; + float *pDeviceDerivativeKernel; + }; }