Skip to content

Commit

Permalink
Batch-fy aggregate flow. NOTE lost accuracy
Browse files Browse the repository at this point in the history
  • Loading branch information
Ashwin Sekar committed May 10, 2017
1 parent fc67d64 commit 21e559a
Show file tree
Hide file tree
Showing 6 changed files with 294 additions and 80 deletions.
18 changes: 9 additions & 9 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -79,12 +79,12 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "SELECTCHANNEL=3")
target_link_libraries(flow ${OpenCV_LIBS})

# CUDA sandbox
set(SANDBOX_FILES
# sandbox/process_sobel.cpp
# sandbox/process_resize.cpp
# sandbox/process_resizeGrad.cpp
sandbox/process_pad.cpp
# sandbox/RgbMatTest.cpp
sandbox/sandbox.cpp)
cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
target_link_libraries(sandbox ${OpenCV_LIBS})
# set(SANDBOX_FILES
# # sandbox/process_sobel.cpp
# # sandbox/process_resize.cpp
# # sandbox/process_resizeGrad.cpp
# sandbox/process_pad.cpp
# # sandbox/RgbMatTest.cpp
# sandbox/sandbox.cpp)
# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
# target_link_libraries(sandbox ${OpenCV_LIBS})
59 changes: 59 additions & 0 deletions src/kernels/densify.cu
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,47 @@ __global__ void kernelDensifyPatch(

}


/**
 * Accumulates every valid patch's flow estimate into the dense flow field.
 *
 * Launch layout: one block per patch (blockIdx.x is the patch index) and one
 * thread per patch column (threadIdx.x in [0, patch_size)). Each thread walks
 * down the patch_size rows of its column. Extra threads (threadIdx.x >=
 * patch_size) exit immediately.
 *
 * For each covered image pixel the weight is
 *   1 / (max(minErrVal, c0) + max(minErrVal, c1) + max(minErrVal, c2))
 * where c0..c2 are the three consecutive cost values stored for that patch
 * pixel. The weight and the weighted flow are added with atomicAdd because
 * several patches may cover the same pixel.
 *
 * @param costs      per-patch pointers to cost buffers; each buffer is read as
 *                   patch_size * patch_size * 3 floats, row-major, 3 per pixel
 * @param flow       accumulated weighted flow, 2 floats per image pixel (x, y)
 * @param weights    accumulated weight, 1 float per image pixel
 * @param flowXs     x flow of each patch
 * @param flowYs     y flow of each patch
 * @param valid      per-patch flag; patches with a false flag are skipped
 * @param midpointX  x coordinate of each patch midpoint
 * @param midpointY  y coordinate of each patch midpoint
 * @param width      image width in pixels
 * @param height     image height in pixels
 * @param patch_size patch side length in pixels
 * @param minErrVal  lower clamp applied to every cost value
 *
 * All pointers are dereferenced on the device. flow and weights are only
 * added to, never cleared, so the caller must zero them beforehand.
 */
__global__ void kernelDensifyPatches(
    float** costs, float* flow, float* weights,
    float* flowXs, float* flowYs, bool* valid,
    float* midpointX, float* midpointY,
    int width, int height,
    int patch_size, float minErrVal) {

  int patchId = blockIdx.x;
  int tid = threadIdx.x;

  // Threads beyond the patch width would index cost entries of other rows.
  if (tid >= patch_size || !valid[patchId]) return;

  // Top-left corner of the patch in image coordinates (float -> int truncation).
  int lower_bound = -patch_size / 2;
  int xt = midpointX[patchId] + lower_bound;
  int yt = midpointY[patchId] + lower_bound;

  // This thread's image column. Checking x on its own (instead of only the
  // flattened index) keeps columns outside the image from wrapping around
  // into the neighbouring row.
  int x = xt + tid;
  if (x < 0 || x >= width) return;

  const float* cost = costs[patchId];
  const float flowX = flowXs[patchId];
  const float flowY = flowYs[patchId];

  for (int row = 0; row < patch_size; ++row) {

    int y = yt + row;
    if (y < 0 || y >= height) continue;

    int i = 3 * (row * patch_size + tid);  // first of the 3 cost values
    int j = y * width + x;                 // flattened image pixel index

    float absw = fmaxf(minErrVal, cost[i]);
    absw += fmaxf(minErrVal, cost[i + 1]);
    absw += fmaxf(minErrVal, cost[i + 2]);
    absw = 1.0f / absw;  // float literal: avoids a double-precision divide

    // Weight contribution of the three channels
    atomicAdd(&weights[j], absw);

    atomicAdd(&flow[2 * j], flowX * absw);
    atomicAdd(&flow[2 * j + 1], flowY * absw);

  }

}


__global__ void kernelNormalizeFlow(
float* pDeviceFlowOut, float* pDeviceWeights, int N) {

Expand Down Expand Up @@ -89,4 +130,22 @@ namespace cu {
kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N);
}

/**
 * Host wrapper that aggregates all patches into the dense flow field with a
 * single launch of kernelDensifyPatches.
 *
 * Launch configuration: n_patches blocks of op->patch_size threads, i.e. one
 * block per patch and one thread per patch column.
 *
 * @param costs      per-patch cost buffer pointers, passed straight to the kernel
 * @param flow       flow accumulator, passed straight to the kernel
 * @param weights    weight accumulator, passed straight to the kernel
 * @param flowXs     per-patch x flow
 * @param flowYs     per-patch y flow
 * @param valid      per-patch validity flags
 * @param midpointX  per-patch midpoint x coordinates
 * @param midpointY  per-patch midpoint y coordinates
 * @param n_patches  number of patches (entries in the per-patch arrays)
 * @param op         supplies patch_size and min_errval
 * @param i_params   supplies the image width and height
 *
 * The launch is asynchronous and its status is not checked here.
 * NOTE(review): op->patch_size is used directly as the block size, so it must
 * not exceed the device's threads-per-block limit -- confirm against callers.
 */
void densifyPatches(
    float** costs, float* flow, float* weights,
    float* flowXs, float* flowYs, bool* valid,
    float* midpointX, float* midpointY, int n_patches,
    const opt_params* op, const img_params* i_params) {

  int nBlocks = n_patches;
  int nThreadsPerBlock = op->patch_size;

  // A zero-sized grid or block is an invalid launch configuration and would
  // leave a pending CUDA error; with nothing to aggregate, simply return.
  if (nBlocks <= 0 || nThreadsPerBlock <= 0) return;

  kernelDensifyPatches<<<nBlocks, nThreadsPerBlock>>>(
      costs, flow, weights,
      flowXs, flowYs, valid,
      midpointX, midpointY,
      i_params->width, i_params->height,
      op->patch_size, op->min_errval);

}

}
11 changes: 11 additions & 0 deletions src/kernels/densify.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "../common/Exceptions.h"
#include "../common/timer.h"
#include "../sandbox/process.h"
#include "../patch.h"

using namespace OFC;

namespace cu {

Expand All @@ -29,6 +32,14 @@ namespace cu {
int width, int height,
int patchSize, float minErrVal);


/**
 * Aggregates the flow of all patches into the dense flow field in one
 * batched kernel launch (one block per patch, op->patch_size threads per
 * block).
 *
 * @param costs      array of n_patches pointers to per-patch cost buffers
 *                   (read on the device as 3 floats per patch pixel)
 * @param flow       accumulated weighted flow, 2 floats per image pixel
 * @param weights    accumulated weight, 1 float per image pixel
 * @param flowXs     per-patch x flow
 * @param flowYs     per-patch y flow
 * @param valid      per-patch validity flags; invalid patches are skipped
 * @param midpointX  per-patch midpoint x coordinates
 * @param midpointY  per-patch midpoint y coordinates
 * @param n_patches  number of patches
 * @param op         supplies patch_size and min_errval
 * @param i_params   supplies the image width and height
 *
 * All pointer arguments are handed to the kernel unchanged, so they must be
 * device-accessible. flow and weights are accumulated into, not cleared.
 */
void densifyPatches(
float** costs, float* flow, float* weights,
float* flowXs, float* flowYs, bool* valid,
float* midpointX, float* midpointY, int n_patches,
const opt_params* op, const img_params* i_params);


void normalizeFlow(
float* pDeviceFlowOut, float* pDeviceWeights, int N);

Expand Down
45 changes: 43 additions & 2 deletions src/patchgrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -263,14 +263,55 @@ namespace OFC {

void PatGridClass::AggregateFlowDense(float *flowout) {

bool isValid[n_patches];
float flowXs[n_patches];
float flowYs[n_patches];
float* costs[n_patches];

for (int i = 0; i < n_patches; i++) {
isValid[i] = patches[i]->IsValid();
flowXs[i] = (*(patches[i]->GetCurP()))[0];
flowYs[i] = (*(patches[i]->GetCurP()))[1];
costs[i] = patches[i]->GetDeviceCostDiffPtr();
}

bool *deviceIsValid;
float* deviceFlowXs, * deviceFlowYs;
float** deviceCosts;

checkCudaErrors(
cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) );

checkCudaErrors( cudaMemcpy(deviceIsValid, isValid,
n_patches * sizeof(bool), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs,
n_patches * sizeof(float), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs,
n_patches * sizeof(float), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceCosts, costs,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );


gettimeofday(&tv_start, nullptr);

// Zero the device-side weight and flow accumulators before aggregating patches
checkCudaErrors(
cudaMemset (pDeviceWeights, 0.0, i_params->width * i_params->height * sizeof(float)) );
checkCudaErrors(
cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) );

for (int ip = 0; ip < n_patches; ++ip) {
cu::densifyPatches(
deviceCosts, pDeviceFlowOut, pDeviceWeights,
deviceFlowXs, deviceFlowYs, deviceIsValid,
pDeviceMidpointX, pDeviceMidpointY, n_patches,
op, i_params);
/*for (int ip = 0; ip < n_patches; ++ip) {
if (patches[ip]->IsValid()) {
const Eigen::Vector2f* fl = patches[ip]->GetCurP(); // flow displacement of this patch
Expand All @@ -285,7 +326,7 @@ namespace OFC {
op->patch_size, op->min_errval);
}
}
}*/

gettimeofday(&tv_end, nullptr);
aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
Expand Down
103 changes: 103 additions & 0 deletions src/timing/batch_agg.time
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
[start] warmup: processing 720x480 image
[time] 647.278 (ms) : cudaMalloc
[time] 0.135 (ms) : cudaMemcpy H->D
[time] 227.372 (ms) : warmup
[time] 0.108 (ms) : cudaMemcpy H<-D
[done] warmup: primary compute time: 227.372 (ms)
[time] 0.419 (ms) : I0, I1 cudaMalloc
[time] 1.004 (ms) : cudaMemcpy I0, I1 H->D
[start] pad: processing 1024x436 image
[time] 0.226 (ms) : cudaMalloc
[time] 0.025 (ms) : pad
[done] pad: primary compute time: 0.025 (ms)
[start] pad: processing 1024x436 image
[time] 0.205 (ms) : cudaMalloc
[time] 0.012 (ms) : pad
[done] pad: primary compute time: 0.012 (ms)
TIME (Image loading ) (ms): 24.776
TIME (Grid Memo. Alloc. ) (ms): 47.66
TIME (Setup) (ms): 208.829
I0 448x1024
Constructing pyramids
[start] constructImgPyramids: processing 1024x448 image
[time] 1.463 (ms) : sobel: Ixs[0]
[time] 0.308 (ms) : sobel: Iys[0]
constructImgPyramids level 1: 512x224
[time] 0.275 (ms) : sobel: Ixs[i]
[time] 0.273 (ms) : sobel: Iys[i]
constructImgPyramids level 2: 256x112
[time] 0.281 (ms) : sobel: Ixs[i]
[time] 0.277 (ms) : sobel: Iys[i]
constructImgPyramids level 3: 128x56
[time] 0.273 (ms) : sobel: Ixs[i]
[time] 0.276 (ms) : sobel: Iys[i]
constructImgPyramids level 4: 64x28
[time] 0.286 (ms) : sobel: Ixs[i]
[time] 0.274 (ms) : sobel: Iys[i]
constructImgPyramids level 5: 32x14
[time] 0.276 (ms) : sobel: Ixs[i]
[time] 0.279 (ms) : sobel: Iys[i]
[time] 4.772 (ms) : total time
[done] constructImgPyramids: primmary compute time: 4.541
[start] constructImgPyramids: processing 1024x448 image
[time] 0.279 (ms) : sobel: Ixs[0]
[time] 0.274 (ms) : sobel: Iys[0]
constructImgPyramids level 1: 512x224
[time] 0.28 (ms) : sobel: Ixs[i]
[time] 0.275 (ms) : sobel: Iys[i]
constructImgPyramids level 2: 256x112
[time] 0.275 (ms) : sobel: Ixs[i]
[time] 0.274 (ms) : sobel: Iys[i]
constructImgPyramids level 3: 128x56
[time] 0.285 (ms) : sobel: Ixs[i]
[time] 0.277 (ms) : sobel: Iys[i]
constructImgPyramids level 4: 64x28
[time] 0.276 (ms) : sobel: Ixs[i]
[time] 0.277 (ms) : sobel: Iys[i]
constructImgPyramids level 5: 32x14
[time] 0.307 (ms) : sobel: Ixs[i]
[time] 0.273 (ms) : sobel: Iys[i]
[time] 3.534 (ms) : total time
[done] constructImgPyramids: primmary compute time: 3.352
TIME (Pyramids+Gradients) (ms): 8.32
, cflow
TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 0.09 0.00 31.70 0.10 0.25 -> 32.13 ms.
TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 0.07 0.00 110.32 0.09 0.65 -> 111.12 ms.
TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 0.11 0.00 425.12 0.12 2.16 -> 427.51 ms.
TIME (O.Flow Run-Time ) (ms): 570.787

===============Timings (ms)===============
Avg grad descent iterations: 12.9978
[hessian] 0 tot => -nan avg
[project] 149.146 tot => 0.0277481 avg
[cost] 119.47 tot => 0.0205169 avg
[interpolate] 37.362 tot => 0.00641628 avg
[mean norm] 105.888 tot => 0.0181844 avg
[extract] 0.074
[aggregate] 0.011
[flow norm] 0.04
==========================================

===============Timings (ms)===============
Avg grad descent iterations: 13
[hessian] 0 tot => -nan avg
[project] 38.76 tot => 0.0288393 avg
[cost] 31.095 tot => 0.0213565 avg
[interpolate] 9.248 tot => 0.00635165 avg
[mean norm] 27.84 tot => 0.0191209 avg
[extract] 0.057
[aggregate] 0.011
[flow norm] 0.017
==========================================

===============Timings (ms)===============
Avg grad descent iterations: 13
[hessian] 0 tot => -nan avg
[project] 11.16 tot => 0.0290625 avg
[cost] 8.943 tot => 0.0214976 avg
[interpolate] 2.684 tot => 0.00645192 avg
[mean norm] 7.954 tot => 0.0191202 avg
[extract] 0.08
[aggregate] 0.019
[flow norm] 0.012
==========================================
Loading

0 comments on commit 21e559a

Please sign in to comment.