Skip to content

Commit

Permalink
Batch-fy hessian also, construction time is now very fast < 10 ms
Browse files Browse the repository at this point in the history
  • Loading branch information
Ashwin Sekar committed May 10, 2017
1 parent 0491dcf commit fc67d64
Show file tree
Hide file tree
Showing 9 changed files with 470 additions and 16 deletions.
66 changes: 62 additions & 4 deletions src/kernels/extract.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,9 +33,11 @@ __global__ void kernelExtractPatch(
}


__global__ void kernelExtractPatches(
__global__ void kernelExtractPatchesAndHessians(
float** patches, float** patchxs, float** patchys,
const float * I0, const float * I0x, const float * I0y,
float* H00, float* H01, float* H11,
float** tempXX, float** tempXY, float** tempYY,
float* midpointX, float* midpointY, int padding,
int patch_size, int width_pad) {

Expand All @@ -45,6 +47,9 @@ __global__ void kernelExtractPatches(
float* patch = patches[patchId];
float* patchX = patchxs[patchId];
float* patchY = patchys[patchId];
float* XX = tempXX[patchId];
float* XY = tempXY[patchId];
float* YY = tempYY[patchId];

int x = round(midpointX[patchId]) + padding;
int y = round(midpointY[patchId]) + padding;
Expand All @@ -57,8 +62,57 @@ __global__ void kernelExtractPatches(
patch[i] = I0[j];
patchX[i] = I0x[j];
patchY[i] = I0y[j];
XX[i] = patchX[i] * patchX[i];
XY[i] = patchX[i] * patchY[i];
YY[i] = patchY[i] * patchY[i];
}

__syncthreads();

// Mean normalize
__shared__ float mean;

if (tid == 0) {

mean = 0.0;
for (int i = 0; i < patch_size * patch_size * 3; i++) {
mean += patch[i];
}
mean /= patch_size * patch_size * 3;

}

__syncthreads();

for (int i = tid; i < patch_size * patch_size * 3;
i+= 3 * patch_size) {
patch[i] -= mean;
}

// TODO: can this be done in parallel?
if (tid == 0) {

float h00 = 0.0, h01 = 0.0, h11 = 0.0;

for (int i = 0; i < patch_size * patch_size * 3; i++) {
h00 += XX[i];
h01 += XY[i];
h11 += YY[i];
}

// If not invertible adjust values
if (h00 * h11 - h01 * h01 == 0) {
h00 += 1e-10;
h11 += 1e-10;
}

H00[patchId] = h00;
H01[patchId] = h01;
H11[patchId] = h11;

}


}


Expand All @@ -80,17 +134,21 @@ namespace cu {
}


void extractPatches(float** patches, float** patchxs, float** patchys,
void extractPatchesAndHessians(
float** patches, float** patchxs, float** patchys,
const float * I0, const float * I0x, const float * I0y,
float* H00, float* H01, float* H11,
float** tempXX, float** tempXY, float** tempYY,
float* midpointX, float* midpointY, int n_patches,
const opt_params* op, const img_params* i_params) {

int nBlocks = n_patches;
int nThreadsPerBlock = 3 * op->patch_size;

kernelExtractPatches<<<nBlocks, nThreadsPerBlock>>>(
kernelExtractPatchesAndHessians<<<nBlocks, nThreadsPerBlock>>>(
patches, patchxs, patchys,
I0, I0x, I0y, midpointX, midpointY,
I0, I0x, I0y, H00, H01, H11,
tempXX, tempXY, tempYY, midpointX, midpointY,
i_params->padding, op->patch_size, i_params->width_pad);

}
Expand Down
5 changes: 4 additions & 1 deletion src/kernels/extract.h
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,11 @@ namespace cu {
const float* I0, const float* I0x, const float* I0y, int patch_offset,
int patch_size, int width_pad);

void extractPatches(float** patches, float** patchxs, float** patchys,
void extractPatchesAndHessians(
float** patches, float** patchxs, float** patchys,
const float * I0, const float * I0x, const float * I0y,
float* H00, float* H01, float* H11,
float** tempXX, float** tempXY, float** tempYY,
float* midpointX, float* midpointY, int n_patches,
const opt_params* op, const img_params* i_params);

Expand Down
22 changes: 16 additions & 6 deletions src/patch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ namespace OFC {
// void PatClass::InitializePatch(const float * _I0,
// const float * _I0x, const float * _I0y, const Eigen::Vector2f _midpoint) {
void PatClass::InitializePatch(float * _patch,
float * _patchx, float * _patchy,
float * _patchx, float * _patchy, float H00, float H01, float H11,
const Eigen::Vector2f _midpoint) {

// I0 = _I0;
Expand All @@ -106,14 +106,20 @@ namespace OFC {
midpoint = _midpoint;

ResetPatchState();
ExtractPatch();
ComputeHessian();

p_state->hessian(0,0) = H00;
p_state->hessian(0,1) = H01;
p_state->hessian(1,0) = p_state->hessian(0,1);
p_state->hessian(1,1) = H11;

//ExtractPatch();
// ComputeHessian(H00, H01, H11);

}

void PatClass::ComputeHessian() {
void PatClass::ComputeHessian(float H00, float H01, float H11) {

gettimeofday(&tv_start, nullptr);
/*gettimeofday(&tv_start, nullptr);
CUBLAS_CHECK (
cublasSdot(op->cublasHandle, patch.size(),
Expand All @@ -131,8 +137,12 @@ namespace OFC {
hessianTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;
hessianCalls++;
hessianCalls++;*/

p_state->hessian(0,0) = H00;
p_state->hessian(0,1) = H01;
p_state->hessian(1,0) = p_state->hessian(0,1);
p_state->hessian(1,1) = H11;

// If not invertible adjust values
if (p_state->hessian.determinant() == 0) {
Expand Down
3 changes: 2 additions & 1 deletion src/patch.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ namespace OFC {
// const float * _I0y, const Eigen::Vector2f _midpoint);
void InitializePatch(float * _patch,
float * _patchx, float* _patchy,
float H00, float H01, float H11,
const Eigen::Vector2f _midpoint);
void SetTargetImage(const float * _I1);

Expand Down Expand Up @@ -75,7 +76,7 @@ namespace OFC {
void OptimizeComputeErrImg();
void UpdateMidpoint();
void ResetPatchState();
void ComputeHessian();
void ComputeHessian(float H00, float H01, float H11);
void ComputeCostErr();

// Extract patch on integer position, and gradients, No Bilinear interpolation
Expand Down
73 changes: 69 additions & 4 deletions src/patchgrid.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,24 +85,43 @@ namespace OFC {
checkCudaErrors(
cudaMalloc ((void**) &pDeviceFlowOut, i_params->width * i_params->height * 2 * sizeof(float)) );

// Patches
// Patches and Hessians
checkCudaErrors(
cudaMalloc((void**) &pDevicePatches, n_patches * sizeof(float*)) );
checkCudaErrors(
cudaMalloc((void**) &pDevicePatchXs, n_patches * sizeof(float*)) );
checkCudaErrors(
cudaMalloc((void**) &pDevicePatchYs, n_patches * sizeof(float*)) );

checkCudaErrors(
cudaMalloc((void**) &pDeviceTempXX, n_patches * sizeof(float*)) );
checkCudaErrors(
cudaMalloc((void**) &pDeviceTempXY, n_patches * sizeof(float*)) );
checkCudaErrors(
cudaMalloc((void**) &pDeviceTempYY, n_patches * sizeof(float*)) );

pHostDevicePatches = new float*[n_patches];
pHostDevicePatchXs = new float*[n_patches];
pHostDevicePatchYs = new float*[n_patches];

float* pHostDeviceTempXX[n_patches];
float* pHostDeviceTempXY[n_patches];
float* pHostDeviceTempYY[n_patches];

for (int i = 0; i < n_patches; i++) {
checkCudaErrors(
cudaMalloc((void**) &pHostDevicePatches[i], op->n_vals * sizeof(float)) );
checkCudaErrors(
cudaMalloc((void**) &pHostDevicePatchXs[i], op->n_vals * sizeof(float)) );
checkCudaErrors(
cudaMalloc((void**) &pHostDevicePatchYs[i], op->n_vals * sizeof(float)) );

checkCudaErrors(
cudaMalloc((void**) &pHostDeviceTempXX[i], op->n_vals * sizeof(float)) );
checkCudaErrors(
cudaMalloc((void**) &pHostDeviceTempXY[i], op->n_vals * sizeof(float)) );
checkCudaErrors(
cudaMalloc((void**) &pHostDeviceTempYY[i], op->n_vals * sizeof(float)) );
}

checkCudaErrors( cudaMemcpy(pDevicePatches, pHostDevicePatches,
Expand All @@ -112,6 +131,23 @@ namespace OFC {
checkCudaErrors( cudaMemcpy(pDevicePatchYs, pHostDevicePatchYs,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );


checkCudaErrors( cudaMemcpy(pDeviceTempXX, pHostDeviceTempXX,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(pDeviceTempXY, pHostDeviceTempXY,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );
checkCudaErrors( cudaMemcpy(pDeviceTempYY, pHostDeviceTempYY,
n_patches * sizeof(float*), cudaMemcpyHostToDevice) );

// Hessian
H00 = new float[n_patches];
H01 = new float[n_patches];
H11 = new float[n_patches];

checkCudaErrors( cudaMalloc((void**) &pDeviceH00, n_patches * sizeof(float)) );
checkCudaErrors( cudaMalloc((void**) &pDeviceH01, n_patches * sizeof(float)) );
checkCudaErrors( cudaMalloc((void**) &pDeviceH11, n_patches * sizeof(float)) );

aggregateTime = 0.0;
meanTime = 0.0;
extractTime = 0.0;
Expand All @@ -123,6 +159,11 @@ namespace OFC {
cudaFree(pDevicePatches[i]);
cudaFree(pDevicePatchXs[i]);
cudaFree(pDevicePatchYs[i]);

cudaFree(pDeviceTempXX[i]);
cudaFree(pDeviceTempXY[i]);
cudaFree(pDeviceTempYY[i]);

delete patches[i];
}

Expand All @@ -139,6 +180,18 @@ namespace OFC {
cudaFree(pDeviceMidpointX);
cudaFree(pDeviceMidpointY);

cudaFree(pDeviceH00);
cudaFree(pDeviceH01);
cudaFree(pDeviceH11);

delete H00;
delete H01;
delete H11;

cudaFree(pDeviceTempXX);
cudaFree(pDeviceTempXY);
cudaFree(pDeviceTempYY);

}

void PatGridClass::InitializeGrid(const float * _I0, const float * _I0x, const float * _I0y) {
Expand All @@ -148,15 +201,27 @@ namespace OFC {
I0y = _I0y;

gettimeofday(&tv_start, nullptr);
cu::extractPatches(pDevicePatches, pDevicePatchXs, pDevicePatchYs,
I0, I0x, I0y, pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params);

cu::extractPatchesAndHessians(pDevicePatches, pDevicePatchXs, pDevicePatchYs,
I0, I0x, I0y, pDeviceH00, pDeviceH01, pDeviceH11,
pDeviceTempXX, pDeviceTempXY, pDeviceTempYY,
pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params);

checkCudaErrors(
cudaMemcpy(H00, pDeviceH00, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(H01, pDeviceH01, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );
checkCudaErrors(
cudaMemcpy(H11, pDeviceH11, n_patches * sizeof(float), cudaMemcpyDeviceToHost) );

gettimeofday(&tv_end, nullptr);
extractTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f +
(tv_end.tv_usec - tv_start.tv_usec) / 1000.0f;

for (int i = 0; i < n_patches; ++i) {
patches[i]->InitializePatch(pHostDevicePatches[i],
pHostDevicePatchXs[i], pHostDevicePatchYs[i], midpoints_ref[i]);
pHostDevicePatchXs[i], pHostDevicePatchYs[i],
H00[i], H01[i], H11[i], midpoints_ref[i]);
p_init[i].setZero();
}

Expand Down
8 changes: 8 additions & 0 deletions src/patchgrid.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,10 +45,18 @@ namespace OFC {

float* pDeviceWeights, *pDeviceFlowOut;

// Patches
float** pDevicePatches, ** pDevicePatchXs, ** pDevicePatchYs;
float** pHostDevicePatches, **pHostDevicePatchXs, **pHostDevicePatchYs;
float* pDeviceMidpointX, * pDeviceMidpointY;

// Hessian
// TODO: Can we shared memory?
float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY;
float* pDeviceH00, * pDeviceH01, * pDeviceH11;
float* H00, * H01, * H11;


const img_params* i_params;
const opt_params* op;

Expand Down
Loading

0 comments on commit fc67d64

Please sign in to comment.