Skip to content

Commit

Permalink
Finish pipeline - flows need to merged to reduce init and norm times
Browse files Browse the repository at this point in the history
  • Loading branch information
Ashwin Sekar committed May 12, 2017
1 parent 6909adf commit 018ff52
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 80 deletions.
30 changes: 13 additions & 17 deletions src/kernels/densify.cu
Original file line number Diff line number Diff line change
Expand Up @@ -53,18 +53,16 @@ __global__ void kernelDensifyPatch(

__global__ void kernelDensifyPatches(
float** costs, float* flow, float* weights,
float* flowXs, float* flowYs, bool* valid,
float* midpointX, float* midpointY,
dev_patch_state* states,
int width, int height,
int patch_size, float minErrVal) {

int patchId = blockIdx.x;
int tid = threadIdx.x;
if (!valid[patchId]) return;

int lower_bound = -patch_size / 2;
int xt = midpointX[patchId] + lower_bound;
int yt = midpointY[patchId] + lower_bound;
int xt = states[patchId].midpoint_orgx + lower_bound;
int yt = states[patchId].midpoint_orgy + lower_bound;
int offset = (xt + yt * width) + tid;

float* cost = costs[patchId];
Expand All @@ -82,8 +80,8 @@ __global__ void kernelDensifyPatches(
// Weight contribution RGB
atomicAdd(&weights[j], absw);

atomicAdd(&flow[2 * j], flowXs[patchId] * absw);
atomicAdd(&flow[2 * j + 1], flowYs[patchId] * absw);
atomicAdd(&flow[2 * j], states[patchId].p_curx * absw);
atomicAdd(&flow[2 * j + 1], states[patchId].p_cury * absw);
}

}
Expand All @@ -92,13 +90,13 @@ __global__ void kernelDensifyPatches(


__global__ void kernelNormalizeFlow(
float* pDeviceFlowOut, float* pDeviceWeights, int N) {
float* pDeviceFlowOut, float* pDeviceWeights, int N, int numBlocks) {

int i = blockIdx.x * blockDim.x + threadIdx.x;

if (i < N && pDeviceWeights[i] > 0) {
pDeviceFlowOut[2 * i] /= pDeviceWeights[i];
pDeviceFlowOut[2 * i + 1] /= pDeviceWeights[i];
for (; i < N; i+= blockDim.x * numBlocks) {
if (pDeviceWeights[i / 2] > 0)
pDeviceFlowOut[i] /= pDeviceWeights[i / 2];
}

}
Expand Down Expand Up @@ -127,24 +125,22 @@ namespace cu {
float* pDeviceFlowOut, float* pDeviceWeights, int N) {

int nThreadsPerBlock = 64;
int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock;
int nBlocks = 10;

kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N);
kernelNormalizeFlow<<<nBlocks, nThreadsPerBlock>>>(pDeviceFlowOut, pDeviceWeights, N, nBlocks);
}

void densifyPatches(
float** costs, float* flow, float* weights,
float* flowXs, float* flowYs, bool* valid,
float* midpointX, float* midpointY, int n_patches,
dev_patch_state* states, int n_patches,
const opt_params* op, const img_params* i_params) {

int nBlocks = n_patches;
int nThreadsPerBlock = op->patch_size;

kernelDensifyPatches<<<nBlocks, nThreadsPerBlock>>>(
costs, flow, weights,
flowXs, flowYs, valid,
midpointX, midpointY,
states,
i_params->width, i_params->height,
op->patch_size, op->min_errval);

Expand Down
3 changes: 1 addition & 2 deletions src/kernels/densify.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,7 @@ namespace cu {

void densifyPatches(
float** costs, float* flow, float* weights,
float* flowXs, float* flowYs, bool* valid,
float* midpointX, float* midpointY, int n_patches,
dev_patch_state* states, int n_patches,
const opt_params* op, const img_params* i_params);


Expand Down
87 changes: 26 additions & 61 deletions src/patchgrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,11 @@ namespace OFC {
checkCudaErrors( cudaMemcpy(pDevicePatchStates, pHostDevicePatchStates,
n_patches * sizeof(dev_patch_state), cudaMemcpyHostToDevice) );

// Prev flow
int flow_size = i_params->width * i_params->height / 2;
checkCudaErrors(
cudaMalloc ((void**) &pDevFlowPrev, flow_size * sizeof(float)) );


// Hessian
H00 = new float[n_patches];
Expand All @@ -185,6 +190,7 @@ namespace OFC {
meanTime = 0.0;
extractTime = 0.0;
optiTime = 0.0;
coarseTime = 0.0;
}


Expand Down Expand Up @@ -223,6 +229,8 @@ namespace OFC {
delete pHostDeviceTempXY;
delete pHostDeviceTempYY;

cudaFree(pDevFlowPrev);

cudaFree(pDeviceH00);
cudaFree(pDeviceH01);
cudaFree(pDeviceH11);
Expand Down Expand Up @@ -280,57 +288,24 @@ namespace OFC {

void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) {

float * devFlowPrev;
int flow_size = i_params->width * i_params->height / 2;
checkCudaErrors(
cudaMalloc ((void**) &devFlowPrev, flow_size * sizeof(float)) );
checkCudaErrors( cudaMemcpy(devFlowPrev, flow_prev,
checkCudaErrors( cudaMemcpy(pDevFlowPrev, flow_prev,
flow_size * sizeof(float), cudaMemcpyHostToDevice) );

cu::initCoarserOF(devFlowPrev, pDevicePatchStates,
gettimeofday(&tv_start, nullptr);

cu::initCoarserOF(pDevFlowPrev, pDevicePatchStates,
n_patches, i_params);

gettimeofday(&tv_end, nullptr);
coarseTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;

}


void PatGridClass::AggregateFlowDense(float *flowout) {

/*bool isValid[n_patches];
float flowXs[n_patches];
float flowYs[n_patches];
float* costs[n_patches];
for (int i = 0; i < n_patches; i++) {
isValid[i] = patches[i]->IsValid();
flowXs[i] = (*(patches[i]->GetCurP()))[0];
flowYs[i] = (*(patches[i]->GetCurP()))[1];
costs[i] = patches[i]->GetDeviceCostDiffPtr();
}
bool *deviceIsValid;
float* deviceFlowXs, * deviceFlowYs;
float** deviceCosts;
checkCudaErrors(
cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) );
checkCudaErrors(
cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) );
checkCudaErrors( cudaMemcpy(deviceIsValid, isValid,
n_patches * sizeof(bool), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs,
n_patches * sizeof(float), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs,
n_patches * sizeof(float), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(deviceCosts, costs,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );*/


gettimeofday(&tv_start, nullptr);

// Device mem
Expand All @@ -339,39 +314,28 @@ namespace OFC {
checkCudaErrors(
cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) );

/*cu::densifyPatches(
deviceCosts, pDeviceFlowOut, pDeviceWeights,
deviceFlowXs, deviceFlowYs, deviceIsValid,
pDeviceMidpointX, pDeviceMidpointY, n_patches,
op, i_params);*/
for (int ip = 0; ip < n_patches; ++ip) {

float* pweight = pHostDeviceCosts[ip]; // use image error as weight

cu::densifyPatch(
pweight, pDeviceFlowOut, pDeviceWeights,
pDevicePatchStates, ip,
midpoints_ref[ip][0], midpoints_ref[ip][1],
i_params->width, i_params->height,
op->patch_size, op->min_errval);

}
cu::densifyPatches(
pDeviceCosts, pDeviceFlowOut, pDeviceWeights,
pDevicePatchStates, n_patches, op, i_params);

gettimeofday(&tv_end, nullptr);
aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;

gettimeofday(&tv_start, nullptr);

// Normalize all pixels
cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, i_params->width * i_params->height);
cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, 2 * i_params->width * i_params->height);

gettimeofday(&tv_end, nullptr);
meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;

checkCudaErrors(
cudaMemcpy(flowout, pDeviceFlowOut,
i_params->width * i_params->height * 2 * sizeof(float), cudaMemcpyDeviceToHost) );

gettimeofday(&tv_end, nullptr);
meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;

}


Expand All @@ -380,6 +344,7 @@ namespace OFC {
cout << endl;
cout << "===============Timings (ms)===============" << endl;
cout << "[extract] " << extractTime << endl;
cout << "[coarse] " << coarseTime << endl;
cout << "[optiTime] " << optiTime << endl;
cout << "[aggregate] " << aggregateTime << endl;
cout << "[flow norm] " << meanTime << endl;
Expand Down
4 changes: 4 additions & 0 deletions src/patchgrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ namespace OFC {
float** pDeviceRaws, **pDeviceCosts;
float** pHostDeviceRaws, **pHostDeviceCosts;

// Previous flow
float* pDevFlowPrev;

// Hessian
// TODO: Can we shared memory?
float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY;
Expand All @@ -72,6 +75,7 @@ namespace OFC {
double aggregateTime;
double meanTime;
double extractTime;
double coarseTime;
double optiTime;

// float* midpointX_host;
Expand Down

0 comments on commit 018ff52

Please sign in to comment.