diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 6c95cdf..4ffc473 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -79,12 +79,12 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3") target_link_libraries(flow ${OpenCV_LIBS}) # CUDA sandbox -set(SANDBOX_FILES - # sandbox/process_sobel.cpp - # sandbox/process_resize.cpp - # sandbox/process_resizeGrad.cpp - sandbox/process_pad.cpp - # sandbox/RgbMatTest.cpp - sandbox/sandbox.cpp) -cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) -target_link_libraries(sandbox ${OpenCV_LIBS}) +# set(SANDBOX_FILES +# # sandbox/process_sobel.cpp +# # sandbox/process_resize.cpp +# # sandbox/process_resizeGrad.cpp +# sandbox/process_pad.cpp +# # sandbox/RgbMatTest.cpp +# sandbox/sandbox.cpp) +# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) +# target_link_libraries(sandbox ${OpenCV_LIBS}) diff --git a/src/kernels/densify.cu b/src/kernels/densify.cu index 2d413cc..e33767f 100644 --- a/src/kernels/densify.cu +++ b/src/kernels/densify.cu @@ -48,6 +48,47 @@ __global__ void kernelDensifyPatch( } + +__global__ void kernelDensifyPatches( + float** costs, float* flow, float* weights, + float* flowXs, float* flowYs, bool* valid, + float* midpointX, float* midpointY, + int width, int height, + int patch_size, float minErrVal) { + + int patchId = blockIdx.x; + int tid = threadIdx.x; + if (!valid[patchId]) return; + + int lower_bound = -patch_size / 2; + int xt = midpointX[patchId] + lower_bound; + int yt = midpointY[patchId] + lower_bound; + int offset = (xt + yt * width) + tid; + + float* cost = costs[patchId]; + + for (int i = 3 * tid, j = offset; i < patch_size * patch_size * 3; + i += 3 * patch_size, j += width) { + + if (j >= 0 && j < width * height) { + + float absw = (float) (fmaxf(minErrVal, cost[i])); + absw += (float) (fmaxf(minErrVal, cost[i + 1])); + absw += (float) (fmaxf(minErrVal, cost[i + 2])); + absw = 1.0 / absw; + + // Weight contribution RGB + atomicAdd(&weights[j], absw); + + atomicAdd(&flow[2 * j], flowXs[patchId] * absw); + atomicAdd(&flow[2 * j + 1], flowYs[patchId] * absw); + } + + } + +} + + __global__ void kernelNormalizeFlow( float* pDeviceFlowOut, float* pDeviceWeights, int N) { @@ -89,4 +130,22 @@ namespace cu { kernelNormalizeFlow<<>>(pDeviceFlowOut, pDeviceWeights, N); } + void densifyPatches( + float** costs, float* flow, float* weights, + float* flowXs, float* flowYs, bool* valid, + float* midpointX, float* midpointY, int n_patches, + const opt_params* op, const img_params* i_params) { + + int nBlocks = n_patches; + int nThreadsPerBlock = op->patch_size; + + kernelDensifyPatches<<>>( + costs, flow, weights, + flowXs, flowYs, valid, + midpointX, midpointY, + i_params->width, i_params->height, + op->patch_size, op->min_errval); + + } + } diff --git a/src/kernels/densify.h b/src/kernels/densify.h index 8b705cf..b755050 100644 --- a/src/kernels/densify.h +++ b/src/kernels/densify.h @@ -19,6 +19,9 @@ #include "../common/Exceptions.h" #include "../common/timer.h" #include "../sandbox/process.h" +#include "../patch.h" + +using namespace OFC; namespace cu { @@ -29,6 +32,14 @@ namespace cu { int width, int height, int patchSize, float minErrVal); + + void densifyPatches( + float** costs, float* flow, float* weights, + float* flowXs, float* flowYs, bool* valid, + float* midpointX, float* midpointY, int n_patches, + const opt_params* op, const img_params* i_params); + + void normalizeFlow( float* pDeviceFlowOut, float* pDeviceWeights, int N); diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp index 47b0363..ad2c216 100644 --- a/src/patchgrid.cpp +++ b/src/patchgrid.cpp @@ -263,14 +263,55 @@ namespace OFC { void PatGridClass::AggregateFlowDense(float *flowout) { + bool isValid[n_patches]; + float flowXs[n_patches]; + float flowYs[n_patches]; + float* costs[n_patches]; + + for (int i = 0; i < n_patches; i++) { + isValid[i] = patches[i]->IsValid(); + flowXs[i] = (*(patches[i]->GetCurP()))[0]; + flowYs[i] = (*(patches[i]->GetCurP()))[1]; + costs[i] = patches[i]->GetDeviceCostDiffPtr(); + } + + bool *deviceIsValid; + float* deviceFlowXs, * deviceFlowYs; + float** deviceCosts; + + checkCudaErrors( + cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) ); + checkCudaErrors( + cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) ); + checkCudaErrors( + cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) ); + checkCudaErrors( + cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) ); + + checkCudaErrors( cudaMemcpy(deviceIsValid, isValid, + n_patches * sizeof(bool), cudaMemcpyHostToDevice) ); + checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs, + n_patches * sizeof(float), cudaMemcpyHostToDevice) ); + checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs, + n_patches * sizeof(float), cudaMemcpyHostToDevice) ); + checkCudaErrors( cudaMemcpy(deviceCosts, costs, + n_patches * sizeof(float*), cudaMemcpyHostToDevice) ); + + gettimeofday(&tv_start, nullptr); + // Device mem checkCudaErrors( cudaMemset (pDeviceWeights, 0.0, i_params->width * i_params->height * sizeof(float)) ); checkCudaErrors( cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) ); - for (int ip = 0; ip < n_patches; ++ip) { + cu::densifyPatches( + deviceCosts, pDeviceFlowOut, pDeviceWeights, + deviceFlowXs, deviceFlowYs, deviceIsValid, + pDeviceMidpointX, pDeviceMidpointY, n_patches, + op, i_params); + /*for (int ip = 0; ip < n_patches; ++ip) { if (patches[ip]->IsValid()) { const Eigen::Vector2f* fl = patches[ip]->GetCurP(); // flow displacement of this patch @@ -285,7 +326,7 @@ namespace OFC { op->patch_size, op->min_errval); } - } + }*/ gettimeofday(&tv_end, nullptr); aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + diff --git a/src/timing/batch_agg.time b/src/timing/batch_agg.time new file mode 100644 index 0000000..c322d90 --- /dev/null +++ b/src/timing/batch_agg.time @@ -0,0 +1,103 @@ +[start] warmup: processing 720x480 image +[time] 647.278 (ms) : cudaMalloc +[time] 0.135 (ms) : cudaMemcpy H->D +[time] 227.372 (ms) : warmup +[time] 0.108 (ms) : cudaMemcpy H<-D +[done] warmup: primary compute time: 227.372 (ms) +[time] 0.419 (ms) : I0, I1 cudaMalloc +[time] 1.004 (ms) : cudaMemcpy I0, I1 H->D +[start] pad: processing 1024x436 image +[time] 0.226 (ms) : cudaMalloc +[time] 0.025 (ms) : pad +[done] pad: primary compute time: 0.025 (ms) +[start] pad: processing 1024x436 image +[time] 0.205 (ms) : cudaMalloc +[time] 0.012 (ms) : pad +[done] pad: primary compute time: 0.012 (ms) +TIME (Image loading ) (ms): 24.776 +TIME (Grid Memo. Alloc. ) (ms): 47.66 +TIME (Setup) (ms): 208.829 +I0 448x1024 +Constructing pyramids +[start] constructImgPyramids: processing 1024x448 image +[time] 1.463 (ms) : sobel: Ixs[0] +[time] 0.308 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.275 (ms) : sobel: Ixs[i] +[time] 0.273 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.281 (ms) : sobel: Ixs[i] +[time] 0.277 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.273 (ms) : sobel: Ixs[i] +[time] 0.276 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.286 (ms) : sobel: Ixs[i] +[time] 0.274 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.276 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +[time] 4.772 (ms) : total time +[done] constructImgPyramids: primmary compute time: 4.541 +[start] constructImgPyramids: processing 1024x448 image +[time] 0.279 (ms) : sobel: Ixs[0] +[time] 0.274 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.28 (ms) : sobel: Ixs[i] +[time] 0.275 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.275 (ms) : sobel: Ixs[i] +[time] 0.274 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.285 (ms) : sobel: Ixs[i] +[time] 0.277 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.276 (ms) : sobel: Ixs[i] +[time] 0.277 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.307 (ms) : sobel: Ixs[i] +[time] 0.273 (ms) : sobel: Iys[i] +[time] 3.534 (ms) : total time +[done] constructImgPyramids: primmary compute time: 3.352 +TIME (Pyramids+Gradients) (ms): 8.32 +, cflow +TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 0.09 0.00 31.70 0.10 0.25 -> 32.13 ms. +TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 0.07 0.00 110.32 0.09 0.65 -> 111.12 ms. +TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 0.11 0.00 425.12 0.12 2.16 -> 427.51 ms. +TIME (O.Flow Run-Time ) (ms): 570.787 + +===============Timings (ms)=============== +Avg grad descent iterations: 12.9978 +[hessian] 0 tot => -nan avg +[project] 149.146 tot => 0.0277481 avg +[cost] 119.47 tot => 0.0205169 avg +[interpolate] 37.362 tot => 0.00641628 avg +[mean norm] 105.888 tot => 0.0181844 avg +[extract] 0.074 +[aggregate] 0.011 +[flow norm] 0.04 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 0 tot => -nan avg +[project] 38.76 tot => 0.0288393 avg +[cost] 31.095 tot => 0.0213565 avg +[interpolate] 9.248 tot => 0.00635165 avg +[mean norm] 27.84 tot => 0.0191209 avg +[extract] 0.057 +[aggregate] 0.011 +[flow norm] 0.017 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 0 tot => -nan avg +[project] 11.16 tot => 0.0290625 avg +[cost] 8.943 tot => 0.0214976 avg +[interpolate] 2.684 tot => 0.00645192 avg +[mean norm] 7.954 tot => 0.0191202 avg +[extract] 0.08 +[aggregate] 0.019 +[flow norm] 0.012 +========================================== diff --git a/src/timing/batch_hessian_too.time b/src/timing/batch_hessian_too.time index cda9202..f7f5659 100644 --- a/src/timing/batch_hessian_too.time +++ b/src/timing/batch_hessian_too.time @@ -1,103 +1,103 @@ [start] warmup: processing 720x480 image -[time] 665.115 (ms) : cudaMalloc -[time] 0.11 (ms) : cudaMemcpy H->D -[time] 216.189 (ms) : warmup -[time] 0.221 (ms) : cudaMemcpy H<-D -[done] warmup: primary compute time: 216.189 (ms) -[time] 0.424 (ms) : I0, I1 cudaMalloc -[time] 0.993 (ms) : cudaMemcpy I0, I1 H->D +[time] 686.119 (ms) : cudaMalloc +[time] 0.106 (ms) : cudaMemcpy H->D +[time] 242.681 (ms) : warmup +[time] 0.099 (ms) : cudaMemcpy H<-D +[done] warmup: primary compute time: 242.681 (ms) +[time] 0.476 (ms) : I0, I1 cudaMalloc +[time] 1.008 (ms) : cudaMemcpy I0, I1 H->D [start] pad: processing 1024x436 image -[time] 0.219 (ms) : cudaMalloc -[time] 0.026 (ms) : pad -[done] pad: primary compute time: 0.026 (ms) +[time] 0.213 (ms) : cudaMalloc +[time] 0.021 (ms) : pad +[done] pad: primary compute time: 0.021 (ms) [start] pad: processing 1024x436 image -[time] 0.202 (ms) : cudaMalloc -[time] 0.012 (ms) : pad -[done] pad: primary compute time: 0.012 (ms) -TIME (Image loading ) (ms): 25.133 -TIME (Grid Memo. Alloc. ) (ms): 47.809 -TIME (Setup) (ms): 207.694 +[time] 0.204 (ms) : cudaMalloc +[time] 0.01 (ms) : pad +[done] pad: primary compute time: 0.01 (ms) +TIME (Image loading ) (ms): 27.769 +TIME (Grid Memo. Alloc. ) (ms): 49.605 +TIME (Setup) (ms): 227.645 I0 448x1024 Constructing pyramids [start] constructImgPyramids: processing 1024x448 image -[time] 1.728 (ms) : sobel: Ixs[0] -[time] 0.327 (ms) : sobel: Iys[0] +[time] 1.73 (ms) : sobel: Ixs[0] +[time] 0.283 (ms) : sobel: Iys[0] constructImgPyramids level 1: 512x224 -[time] 0.279 (ms) : sobel: Ixs[i] -[time] 0.276 (ms) : sobel: Iys[i] +[time] 0.294 (ms) : sobel: Ixs[i] +[time] 0.283 (ms) : sobel: Iys[i] constructImgPyramids level 2: 256x112 -[time] 0.281 (ms) : sobel: Ixs[i] -[time] 0.268 (ms) : sobel: Iys[i] +[time] 0.275 (ms) : sobel: Ixs[i] +[time] 0.289 (ms) : sobel: Iys[i] constructImgPyramids level 3: 128x56 -[time] 0.268 (ms) : sobel: Ixs[i] -[time] 0.273 (ms) : sobel: Iys[i] +[time] 0.274 (ms) : sobel: Ixs[i] +[time] 0.285 (ms) : sobel: Iys[i] constructImgPyramids level 4: 64x28 -[time] 0.267 (ms) : sobel: Ixs[i] -[time] 0.28 (ms) : sobel: Iys[i] +[time] 0.276 (ms) : sobel: Ixs[i] +[time] 0.288 (ms) : sobel: Iys[i] constructImgPyramids level 5: 32x14 -[time] 0.268 (ms) : sobel: Ixs[i] -[time] 0.275 (ms) : sobel: Iys[i] -[time] 4.991 (ms) : total time -[done] constructImgPyramids: primmary compute time: 4.79 +[time] 0.274 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +[time] 5.011 (ms) : total time +[done] constructImgPyramids: primmary compute time: 4.83 [start] constructImgPyramids: processing 1024x448 image -[time] 0.265 (ms) : sobel: Ixs[0] -[time] 0.267 (ms) : sobel: Iys[0] +[time] 0.282 (ms) : sobel: Ixs[0] +[time] 0.276 (ms) : sobel: Iys[0] constructImgPyramids level 1: 512x224 -[time] 0.272 (ms) : sobel: Ixs[i] -[time] 0.271 (ms) : sobel: Iys[i] +[time] 0.278 (ms) : sobel: Ixs[i] +[time] 0.284 (ms) : sobel: Iys[i] constructImgPyramids level 2: 256x112 -[time] 0.266 (ms) : sobel: Ixs[i] -[time] 0.266 (ms) : sobel: Iys[i] +[time] 0.277 (ms) : sobel: Ixs[i] +[time] 0.274 (ms) : sobel: Iys[i] constructImgPyramids level 3: 128x56 -[time] 0.272 (ms) : sobel: Ixs[i] -[time] 0.265 (ms) : sobel: Iys[i] +[time] 0.279 (ms) : sobel: Ixs[i] +[time] 0.282 (ms) : sobel: Iys[i] constructImgPyramids level 4: 64x28 -[time] 0.266 (ms) : sobel: Ixs[i] -[time] 0.266 (ms) : sobel: Iys[i] +[time] 0.277 (ms) : sobel: Ixs[i] +[time] 0.274 (ms) : sobel: Iys[i] constructImgPyramids level 5: 32x14 -[time] 0.274 (ms) : sobel: Ixs[i] -[time] 0.265 (ms) : sobel: Iys[i] -[time] 3.341 (ms) : total time -[done] constructImgPyramids: primmary compute time: 3.215 -TIME (Pyramids+Gradients) (ms): 8.342 +[time] 0.286 (ms) : sobel: Ixs[i] +[time] 0.277 (ms) : sobel: Iys[i] +[time] 3.479 (ms) : total time +[done] constructImgPyramids: primmary compute time: 3.346 +TIME (Pyramids+Gradients) (ms): 8.501 , cflow -TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 0.08 0.00 32.09 0.12 0.26 -> 32.54 ms. -TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 0.07 0.00 112.14 0.35 0.66 -> 113.21 ms. -TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 0.10 0.00 436.45 1.41 2.18 -> 440.14 ms. -TIME (O.Flow Run-Time ) (ms): 585.922 +TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 0.08 0.00 34.25 0.14 0.29 -> 34.76 ms. +TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 0.07 0.00 120.03 0.39 0.74 -> 121.24 ms. +TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 0.11 0.00 440.06 1.35 2.11 -> 443.63 ms. +TIME (O.Flow Run-Time ) (ms): 599.655 ===============Timings (ms)=============== Avg grad descent iterations: 12.9933 [hessian] 0 tot => -nan avg -[project] 153.341 tot => 0.0285392 avg -[cost] 122.301 tot => 0.0210103 avg -[interpolate] 38.586 tot => 0.00662876 avg -[mean norm] 108.655 tot => 0.018666 avg -[extract] 0.074 -[aggregate] 1.372 -[flow norm] 0.04 +[project] 154.088 tot => 0.0286782 avg +[cost] 122.006 tot => 0.0209596 avg +[interpolate] 40.805 tot => 0.00700997 avg +[mean norm] 109.308 tot => 0.0187782 avg +[extract] 0.078 +[aggregate] 1.304 +[flow norm] 0.038 ========================================== ===============Timings (ms)=============== Avg grad descent iterations: 13 [hessian] 0 tot => -nan avg -[project] 39.555 tot => 0.0294308 avg -[cost] 31.504 tot => 0.0216374 avg -[interpolate] 9.531 tot => 0.00654602 avg -[mean norm] 28.148 tot => 0.0193324 avg -[extract] 0.056 -[aggregate] 0.328 -[flow norm] 0.017 +[project] 41.891 tot => 0.0311689 avg +[cost] 33.433 tot => 0.0229622 avg +[interpolate] 11.077 tot => 0.00760783 avg +[mean norm] 30.095 tot => 0.0206696 avg +[extract] 0.059 +[aggregate] 0.378 +[flow norm] 0.015 ========================================== ===============Timings (ms)=============== Avg grad descent iterations: 13 [hessian] 0 tot => -nan avg -[project] 11.309 tot => 0.0294505 avg -[cost] 8.998 tot => 0.0216298 avg -[interpolate] 2.746 tot => 0.00660096 avg -[mean norm] 8.051 tot => 0.0193534 avg -[extract] 0.073 -[aggregate] 0.108 +[project] 11.963 tot => 0.0311536 avg +[cost] 9.552 tot => 0.0229615 avg +[interpolate] 3.159 tot => 0.00759375 avg +[mean norm] 8.55 tot => 0.0205529 avg +[extract] 0.075 +[aggregate] 0.121 [flow norm] 0.013 ==========================================