diff --git a/src/kernels/flowUtil.cu b/src/kernels/flowUtil.cu index 7af150e..20e92a7 100644 --- a/src/kernels/flowUtil.cu +++ b/src/kernels/flowUtil.cu @@ -227,7 +227,6 @@ __global__ void kernelSubLaplacianHorizFillCoeffs( float *src, float *weights, float *coeffs, int height, int width, int stride) { int tidx = blockIdx.x * blockDim.x + threadIdx.x; - int row = tidx / stride; int col = tidx % stride; // Do not calculate the last column @@ -591,12 +590,18 @@ namespace cu { }; + void subLaplacian( + image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert, float *coeffs) { + + cu::subLaplacianHoriz(src->c1, dst->c1, weight_horiz->c1, coeffs, src->height, src->width, src->stride); + cu::subLaplacianVert(src->c1, dst->c1, weight_vert->c1, src->height, src->stride); + + } void subLaplacianHoriz( - float *src, float *dst, float *weights, int height, int width, int stride) { + float *src, float *dst, float *weights, float *coeffs, int height, int width, int stride) { - float *pDeviceCoeffs; - checkCudaErrors( cudaMalloc((void**) &pDeviceCoeffs, height * stride * sizeof(float)) ); + float *pDeviceCoeffs = coeffs; // Setup device pointers float *pDeviceSrc, *pDeviceDst, *pDeviceWeights; @@ -621,8 +626,6 @@ namespace cu { // pDeviceSrc, pDeviceDst, pDeviceWeights, pDeviceCoeffs, height, width, stride); cudaDeviceSynchronize(); calc_print_elapsed("laplacian horiz", start_horiz); - - cudaFree(pDeviceCoeffs); } void subLaplacianVert( @@ -950,9 +953,4 @@ namespace cu { color_image_delete(tmp_im2); } - void subLaplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert){ - cu::subLaplacianHoriz(src->c1, dst->c1, weight_horiz->c1, src->height, src->width, src->stride); - cu::subLaplacianVert(src->c1, dst->c1, weight_vert->c1, src->height, src->stride); - } - } diff --git a/src/kernels/flowUtil.h b/src/kernels/flowUtil.h index 7a74c4c..badff1b 100644 --- a/src/kernels/flowUtil.h +++ b/src/kernels/flowUtil.h @@ -34,8 +34,11 @@ namespace cu { color_image_t *Ixz, color_image_t *Iyz, const float half_delta_over3, const float half_beta, const float half_gamma_over3); + void subLaplacian( + image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert, float *coeffs); + void subLaplacianHoriz( - float *src, float *dst, float *weights, int height, int width, int stride); + float *src, float *dst, float *weights, float *coeffs, int height, int width, int stride); void subLaplacianVert( float *src, float *dst, float *weights, int height, int stride); @@ -79,8 +82,6 @@ namespace cu { color_image_t *dx, color_image_t *dy, color_image_t *dt, color_image_t *dxx, color_image_t *dxy, color_image_t *dyy, color_image_t *dxt, color_image_t *dyt); - void subLaplacian(image_t *dst, const image_t *src, const image_t *weight_horiz, const image_t *weight_vert); - } #endif // end __KERNEL_FLOW_UTIL_H__ diff --git a/src/refine_variational.cpp b/src/refine_variational.cpp index 8d35fb0..60d148a 100644 --- a/src/refine_variational.cpp +++ b/src/refine_variational.cpp @@ -80,6 +80,9 @@ namespace OFC { } } + checkCudaErrors( cudaMalloc((void**) &pDeviceSubLaplacianCoeffs, + flow_sep[0]->height * flow_sep[0]->stride * sizeof(float)) ); + // copy image data into FV structs color_image_t * I0, * I1; I0 = color_image_new(i_params->width, i_params->height); @@ -200,8 +203,8 @@ namespace OFC { calc_print_elapsed(("RefLevelOF " + iterStr + " data").c_str(), start_data); auto start_lapalcian = now(); - cu::subLaplacian(b1, wx, smooth_horiz, smooth_vert); - cu::subLaplacian(b2, wy, smooth_horiz, smooth_vert); + cu::subLaplacian(b1, wx, smooth_horiz, smooth_vert, pDeviceSubLaplacianCoeffs); + cu::subLaplacian(b2, wy, smooth_horiz, smooth_vert, pDeviceSubLaplacianCoeffs); calc_print_elapsed(("RefLevelOF " + iterStr + " laplacian").c_str(), start_lapalcian); // solve system @@ -246,6 +249,7 @@ namespace OFC { VarRefClass::~VarRefClass() { cudaFree(pDeviceColorDerivativeKernel); cudaFree(pDeviceDerivativeKernel); + cudaFree(pDeviceSubLaplacianCoeffs); } } diff --git a/src/refine_variational.h b/src/refine_variational.h index dcdb380..beaf67e 100644 --- a/src/refine_variational.h +++ b/src/refine_variational.h @@ -52,6 +52,7 @@ namespace OFC { float *pDeviceColorDerivativeKernel; float *pDeviceDerivativeKernel; + float *pDeviceSubLaplacianCoeffs; };