diff --git a/src/FDF1.0.1/solver.cpp b/src/FDF1.0.1/solver.cpp new file mode 100644 index 0000000..1f264b1 --- /dev/null +++ b/src/FDF1.0.1/solver.cpp @@ -0,0 +1,381 @@ +#include +#include +#include +#include +#include + +#include + +#include "image.h" +#include "solver.h" +#include "../kernels/flowUtil.h" + +// #include +// #if (VECTOR_WIDTH == 4) +// typedef float32x4_t v4sf; +// #else +// typedef float v4sf; +// #endif + +//THIS IS A SLOW VERSION BUT READABLE +//Perform n iterations of the sor_coupled algorithm +//du and dv are used as initial guesses +//The system form is the same as in opticalflow.c +void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega) +{ + + cu::sor(du->c1, dv->c1, a11->c1, a12->c1, a22->c1, + b1->c1, b2->c1, dpsis_horiz->c1, dpsis_vert->c1, + iterations, omega, du->height, du->width, du->stride); + +} + +// THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION +// the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal +// each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line +// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ +// //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); +// +// if(du->width<2 || du->height<2 || iterations < 1){ +// sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); +// return; +// } +// +// const int stride = du->stride, width = du->width; +// const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); +// int j,iter,i,k; +// float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); +// if(floatarray==NULL){ +// fprintf(stderr, "error in sor_coupled(): not enough memory\n"); +// exit(1); +// } +// float *f1 = floatarray; +// float *f2 = f1+stride; +// float *f3 = f2+stride; +// f1[0] = 0.0f; +// memset(&f1[width], 0, sizeof(float)*(stride-width)); +// memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); +// memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); +// +// { // first iteration +// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; +// float *du_ptr = du->c1, *dv_ptr = dv->c1; +// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); +// +// { // first iteration - first line +// +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// v4sf *vpt = (v4sf*) dpsis_vert->c1; +// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; +// +// for(j=iterheight;--j;){ // first iteration - middle lines +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// { // first iteration - last line +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // reverse 2x2 diagonal block +// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); +// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; +// const v4sf det = A11*A22 - (*a12p)*(*a12p); +// *a11p = A11/det; +// *a22p = A22/det; +// *a12p /= -det; +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// } +// +// +// +// for(iter=iterations;--iter;) // other iterations +// { +// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; +// float *du_ptr = du->c1, *dv_ptr = dv->c1; +// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); +// +// { // other iteration - first line +// +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// v4sf *vpt = (v4sf*) dpsis_vert->c1; +// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; +// +// for(j=iterheight;--j;) // other iteration - middle lines +// { +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++) +// { +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// for(i=iterline; --i;) +// { +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); +// for(k=0;k<4;k++) +// { +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// +// { // other iteration - last line +// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); +// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); +// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); +// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; +// +// { // left block +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); +// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); +// for(k=1;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// for(i=iterline;--i;){ +// // do one iteration +// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); +// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); +// for(k=0;k<4;k++){ +// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; +// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; +// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); +// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); +// } +// // increment pointer +// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; +// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; +// du_ptr += 4; dv_ptr += 4; +// } +// +// } +// } +// +// +// +// free(floatarray); +// +// } +