diff --git a/src/kernels/densify.cu b/src/kernels/densify.cu index fa3ed9a..e964794 100644 --- a/src/kernels/densify.cu +++ b/src/kernels/densify.cu @@ -53,18 +53,16 @@ __global__ void kernelDensifyPatch( __global__ void kernelDensifyPatches( float** costs, float* flow, float* weights, - float* flowXs, float* flowYs, bool* valid, - float* midpointX, float* midpointY, + dev_patch_state* states, int width, int height, int patch_size, float minErrVal) { int patchId = blockIdx.x; int tid = threadIdx.x; - if (!valid[patchId]) return; int lower_bound = -patch_size / 2; - int xt = midpointX[patchId] + lower_bound; - int yt = midpointY[patchId] + lower_bound; + int xt = states[patchId].midpoint_orgx + lower_bound; + int yt = states[patchId].midpoint_orgy + lower_bound; int offset = (xt + yt * width) + tid; float* cost = costs[patchId]; @@ -82,8 +80,8 @@ __global__ void kernelDensifyPatches( // Weight contribution RGB atomicAdd(&weights[j], absw); - atomicAdd(&flow[2 * j], flowXs[patchId] * absw); - atomicAdd(&flow[2 * j + 1], flowYs[patchId] * absw); + atomicAdd(&flow[2 * j], states[patchId].p_curx * absw); + atomicAdd(&flow[2 * j + 1], states[patchId].p_cury * absw); } } @@ -92,13 +90,13 @@ __global__ void kernelDensifyPatches( __global__ void kernelNormalizeFlow( - float* pDeviceFlowOut, float* pDeviceWeights, int N) { + float* pDeviceFlowOut, float* pDeviceWeights, int N, int numBlocks) { int i = blockIdx.x * blockDim.x + threadIdx.x; - if (i < N && pDeviceWeights[i] > 0) { - pDeviceFlowOut[2 * i] /= pDeviceWeights[i]; - pDeviceFlowOut[2 * i + 1] /= pDeviceWeights[i]; + for (; i < N; i+= blockDim.x * numBlocks) { + if (pDeviceWeights[i / 2] > 0) + pDeviceFlowOut[i] /= pDeviceWeights[i / 2]; } } @@ -127,15 +125,14 @@ namespace cu { float* pDeviceFlowOut, float* pDeviceWeights, int N) { int nThreadsPerBlock = 64; - int nBlocks = (N + nThreadsPerBlock - 1) / nThreadsPerBlock; + int nBlocks = 10; - kernelNormalizeFlow<<>>(pDeviceFlowOut, pDeviceWeights, N); + kernelNormalizeFlow<<>>(pDeviceFlowOut, pDeviceWeights, N, nBlocks); } void densifyPatches( float** costs, float* flow, float* weights, - float* flowXs, float* flowYs, bool* valid, - float* midpointX, float* midpointY, int n_patches, + dev_patch_state* states, int n_patches, const opt_params* op, const img_params* i_params) { int nBlocks = n_patches; @@ -143,8 +140,7 @@ namespace cu { kernelDensifyPatches<<>>( costs, flow, weights, - flowXs, flowYs, valid, - midpointX, midpointY, + states, i_params->width, i_params->height, op->patch_size, op->min_errval); diff --git a/src/kernels/densify.h b/src/kernels/densify.h index 096b4ef..fbcfdc7 100644 --- a/src/kernels/densify.h +++ b/src/kernels/densify.h @@ -35,8 +35,7 @@ namespace cu { void densifyPatches( float** costs, float* flow, float* weights, - float* flowXs, float* flowYs, bool* valid, - float* midpointX, float* midpointY, int n_patches, + dev_patch_state* states, int n_patches, const opt_params* op, const img_params* i_params); diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp index 87d2e30..2d5bedc 100644 --- a/src/patchgrid.cpp +++ b/src/patchgrid.cpp @@ -171,6 +171,11 @@ namespace OFC { checkCudaErrors( cudaMemcpy(pDevicePatchStates, pHostDevicePatchStates, n_patches * sizeof(dev_patch_state), cudaMemcpyHostToDevice) ); + // Prev flow + int flow_size = i_params->width * i_params->height / 2; + checkCudaErrors( + cudaMalloc ((void**) &pDevFlowPrev, flow_size * sizeof(float)) ); + // Hessian H00 = new float[n_patches]; @@ -185,6 +190,7 @@ namespace OFC { meanTime = 0.0; extractTime = 0.0; optiTime = 0.0; + coarseTime = 0.0; } @@ -223,6 +229,8 @@ namespace OFC { delete pHostDeviceTempXY; delete pHostDeviceTempYY; + cudaFree(pDevFlowPrev); + cudaFree(pDeviceH00); cudaFree(pDeviceH01); cudaFree(pDeviceH11); @@ -280,57 +288,24 @@ namespace OFC { void PatGridClass::InitializeFromCoarserOF(const float * flow_prev) { - float * devFlowPrev; int flow_size = i_params->width * i_params->height / 2; - checkCudaErrors( - cudaMalloc ((void**) &devFlowPrev, flow_size * sizeof(float)) ); - checkCudaErrors( cudaMemcpy(devFlowPrev, flow_prev, + checkCudaErrors( cudaMemcpy(pDevFlowPrev, flow_prev, flow_size * sizeof(float), cudaMemcpyHostToDevice) ); - cu::initCoarserOF(devFlowPrev, pDevicePatchStates, + gettimeofday(&tv_start, nullptr); + + cu::initCoarserOF(pDevFlowPrev, pDevicePatchStates, n_patches, i_params); + gettimeofday(&tv_end, nullptr); + coarseTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; } void PatGridClass::AggregateFlowDense(float *flowout) { - /*bool isValid[n_patches]; - float flowXs[n_patches]; - float flowYs[n_patches]; - float* costs[n_patches]; - - for (int i = 0; i < n_patches; i++) { - isValid[i] = patches[i]->IsValid(); - flowXs[i] = (*(patches[i]->GetCurP()))[0]; - flowYs[i] = (*(patches[i]->GetCurP()))[1]; - costs[i] = patches[i]->GetDeviceCostDiffPtr(); - } - - bool *deviceIsValid; - float* deviceFlowXs, * deviceFlowYs; - float** deviceCosts; - - checkCudaErrors( - cudaMalloc ((void**) &deviceIsValid, n_patches * sizeof(bool)) ); - checkCudaErrors( - cudaMalloc ((void**) &deviceFlowXs, n_patches * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &deviceFlowYs, n_patches * sizeof(float)) ); - checkCudaErrors( - cudaMalloc ((void**) &deviceCosts, n_patches * sizeof(float*)) ); - - checkCudaErrors( cudaMemcpy(deviceIsValid, isValid, - n_patches * sizeof(bool), cudaMemcpyHostToDevice) ); - checkCudaErrors( cudaMemcpy(deviceFlowXs, flowXs, - n_patches * sizeof(float), cudaMemcpyHostToDevice) ); - checkCudaErrors( cudaMemcpy(deviceFlowYs, flowYs, - n_patches * sizeof(float), cudaMemcpyHostToDevice) ); - checkCudaErrors( cudaMemcpy(deviceCosts, costs, - n_patches * sizeof(float*), cudaMemcpyHostToDevice) );*/ - - gettimeofday(&tv_start, nullptr); // Device mem @@ -339,39 +314,28 @@ namespace OFC { checkCudaErrors( cudaMemset (pDeviceFlowOut, 0.0, i_params->width * i_params->height * 2 * sizeof(float)) ); - /*cu::densifyPatches( - deviceCosts, pDeviceFlowOut, pDeviceWeights, - deviceFlowXs, deviceFlowYs, deviceIsValid, - pDeviceMidpointX, pDeviceMidpointY, n_patches, - op, i_params);*/ - for (int ip = 0; ip < n_patches; ++ip) { - - float* pweight = pHostDeviceCosts[ip]; // use image error as weight - - cu::densifyPatch( - pweight, pDeviceFlowOut, pDeviceWeights, - pDevicePatchStates, ip, - midpoints_ref[ip][0], midpoints_ref[ip][1], - i_params->width, i_params->height, - op->patch_size, op->min_errval); - - } + cu::densifyPatches( + pDeviceCosts, pDeviceFlowOut, pDeviceWeights, + pDevicePatchStates, n_patches, op, i_params); gettimeofday(&tv_end, nullptr); aggregateTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; gettimeofday(&tv_start, nullptr); + // Normalize all pixels - cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, i_params->width * i_params->height); + cu::normalizeFlow(pDeviceFlowOut, pDeviceWeights, 2 * i_params->width * i_params->height); + + gettimeofday(&tv_end, nullptr); + meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; checkCudaErrors( cudaMemcpy(flowout, pDeviceFlowOut, i_params->width * i_params->height * 2 * sizeof(float), cudaMemcpyDeviceToHost) ); - gettimeofday(&tv_end, nullptr); - meanTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + - (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; + } @@ -380,6 +344,7 @@ namespace OFC { cout << endl; cout << "===============Timings (ms)===============" << endl; cout << "[extract] " << extractTime << endl; + cout << "[coarse] " << coarseTime << endl; cout << "[optiTime] " << optiTime << endl; cout << "[aggregate] " << aggregateTime << endl; cout << "[flow norm] " << meanTime << endl; diff --git a/src/patchgrid.h b/src/patchgrid.h index 389fff7..ddfb2ae 100644 --- a/src/patchgrid.h +++ b/src/patchgrid.h @@ -52,6 +52,9 @@ namespace OFC { float** pDeviceRaws, **pDeviceCosts; float** pHostDeviceRaws, **pHostDeviceCosts; + // Previous flow + float* pDevFlowPrev; + // Hessian // TODO: Can we shared memory? float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY; @@ -72,6 +75,7 @@ namespace OFC { double aggregateTime; double meanTime; double extractTime; + double coarseTime; double optiTime; // float* midpointX_host;