-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
1 changed file
with
381 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,381 @@ | ||
#include <stdio.h> | ||
#include <stdlib.h> | ||
#include <math.h> | ||
#include <string.h> | ||
#include <malloc.h> | ||
|
||
#include <omp.h> | ||
|
||
#include "image.h" | ||
#include "solver.h" | ||
#include "../kernels/flowUtil.h" | ||
|
||
// #include <arm_neon.h> | ||
// #if (VECTOR_WIDTH == 4) | ||
// typedef float32x4_t v4sf; | ||
// #else | ||
// typedef float v4sf; | ||
// #endif | ||
|
||
//THIS IS A SLOW VERSION BUT READABLE | ||
//Perform n iterations of the sor_coupled algorithm | ||
//du and dv are used as initial guesses | ||
//The system form is the same as in opticalflow.c | ||
void sor_coupled_slow_but_readable(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega) | ||
{ | ||
|
||
cu::sor(du->c1, dv->c1, a11->c1, a12->c1, a22->c1, | ||
b1->c1, b2->c1, dpsis_horiz->c1, dpsis_vert->c1, | ||
iterations, omega, du->height, du->width, du->stride); | ||
|
||
} | ||
|
||
// THIS IS A FASTER VERSION BUT UNREADABLE, ONLY OPTICAL FLOW WITHOUT OPENMP PARALLELIZATION | ||
// the first iteration is separated from the other to compute the inverse of the 2x2 block diagonal | ||
// each iteration is split in two first line / middle lines / last line, and the left block is computed separately on each line | ||
// void sor_coupled(image_t *du, image_t *dv, image_t *a11, image_t *a12, image_t *a22, const image_t *b1, const image_t *b2, const image_t *dpsis_horiz, const image_t *dpsis_vert, const int iterations, const float omega){ | ||
// //sor_coupled_slow(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); return; printf("test\n"); | ||
// | ||
// if(du->width<2 || du->height<2 || iterations < 1){ | ||
// sor_coupled_slow_but_readable(du,dv,a11,a12,a22,b1,b2,dpsis_horiz,dpsis_vert,iterations,omega); | ||
// return; | ||
// } | ||
// | ||
// const int stride = du->stride, width = du->width; | ||
// const int iterheight = du->height-1, iterline = (stride)/4, width_minus_1_sizeoffloat = sizeof(float)*(width-1); | ||
// int j,iter,i,k; | ||
// float *floatarray = (float*) memalign(16, stride*sizeof(float)*3); | ||
// if(floatarray==NULL){ | ||
// fprintf(stderr, "error in sor_coupled(): not enough memory\n"); | ||
// exit(1); | ||
// } | ||
// float *f1 = floatarray; | ||
// float *f2 = f1+stride; | ||
// float *f3 = f2+stride; | ||
// f1[0] = 0.0f; | ||
// memset(&f1[width], 0, sizeof(float)*(stride-width)); | ||
// memset(&f2[width-1], 0, sizeof(float)*(stride-width+1)); | ||
// memset(&f3[width-1], 0, sizeof(float)*(stride-width+1)); | ||
// | ||
// { // first iteration | ||
// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; | ||
// float *du_ptr = du->c1, *dv_ptr = dv->c1; | ||
// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); | ||
// | ||
// { // first iteration - first line | ||
// | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vp); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// for(i=iterline;--i;){ | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vp); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); | ||
// for(k=0;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// | ||
// v4sf *vpt = (v4sf*) dpsis_vert->c1; | ||
// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; | ||
// | ||
// for(j=iterheight;--j;){ // first iteration - middle lines | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// for(i=iterline;--i;){ | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vpt) + (*vp); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); | ||
// for(k=0;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// | ||
// { // first iteration - last line | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// for(i=iterline;--i;){ | ||
// // reverse 2x2 diagonal block | ||
// const v4sf dpsis = (*hpl) + (*hp) + (*vpt); | ||
// const v4sf A11 = (*a22p)+dpsis, A22 = (*a11p)+dpsis; | ||
// const v4sf det = A11*A22 - (*a12p)*(*a12p); | ||
// *a11p = A11/det; | ||
// *a22p = A22/det; | ||
// *a12p /= -det; | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); | ||
// for(k=0;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// } | ||
// | ||
// | ||
// | ||
// for(iter=iterations;--iter;) // other iterations | ||
// { | ||
// v4sf *a11p = (v4sf*) a11->c1, *a12p = (v4sf*) a12->c1, *a22p = (v4sf*) a22->c1, *b1p = (v4sf*) b1->c1, *b2p = (v4sf*) b2->c1, *hp = (v4sf*) dpsis_horiz->c1, *vp = (v4sf*) dpsis_vert->c1; | ||
// float *du_ptr = du->c1, *dv_ptr = dv->c1; | ||
// v4sf *dub = (v4sf*) (du_ptr+stride), *dvb = (v4sf*) (dv_ptr+stride); | ||
// | ||
// { // other iteration - first line | ||
// | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// for(i=iterline;--i;){ | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vp)*(*dvb) + (*b2p); | ||
// for(k=0;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// | ||
// v4sf *vpt = (v4sf*) dpsis_vert->c1; | ||
// v4sf *dut = (v4sf*) du->c1, *dvt = (v4sf*) dv->c1; | ||
// | ||
// for(j=iterheight;--j;) // other iteration - middle lines | ||
// { | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++) | ||
// { | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// for(i=iterline; --i;) | ||
// { | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*vp)*(*dub) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*vp)*(*dvb) + (*b2p); | ||
// for(k=0;k<4;k++) | ||
// { | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; vp+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; dub+=1; dvb +=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// | ||
// { // other iteration - last line | ||
// memcpy(f1+1, ((float*) hp), width_minus_1_sizeoffloat); | ||
// memcpy(f2, du_ptr+1, width_minus_1_sizeoffloat); | ||
// memcpy(f3, dv_ptr+1, width_minus_1_sizeoffloat); | ||
// v4sf* hpl = (v4sf*) f1, *dur = (v4sf*) f2, *dvr = (v4sf*) f3; | ||
// | ||
// { // left block | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); | ||
// du_ptr[0] += omega*( a11p[0][0]*s1[0] + a12p[0][0]*s2[0] - du_ptr[0] ); | ||
// dv_ptr[0] += omega*( a12p[0][0]*s1[0] + a22p[0][0]*s2[0] - dv_ptr[0] ); | ||
// for(k=1;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// for(i=iterline;--i;){ | ||
// // do one iteration | ||
// const v4sf s1 = (*hp)*(*dur) + (*vpt)*(*dut) + (*b1p); | ||
// const v4sf s2 = (*hp)*(*dvr) + (*vpt)*(*dvt) + (*b2p); | ||
// for(k=0;k<4;k++){ | ||
// const float B1 = hpl[0][k]*du_ptr[k-1] + s1[k]; | ||
// const float B2 = hpl[0][k]*dv_ptr[k-1] + s2[k]; | ||
// du_ptr[k] += omega*( a11p[0][k]*B1 + a12p[0][k]*B2 - du_ptr[k] ); | ||
// dv_ptr[k] += omega*( a12p[0][k]*B1 + a22p[0][k]*B2 - dv_ptr[k] ); | ||
// } | ||
// // increment pointer | ||
// hpl+=1; hp+=1; vpt+=1; a11p+=1; a12p+=1; a22p+=1; | ||
// dur+=1; dvr+=1; dut+=1; dvt+=1; b1p+=1; b2p+=1; | ||
// du_ptr += 4; dv_ptr += 4; | ||
// } | ||
// | ||
// } | ||
// } | ||
// | ||
// | ||
// | ||
// free(floatarray); | ||
// | ||
// } | ||
|