From fc67d64ea6d6e5620af7c59085130bd966bd2c48 Mon Sep 17 00:00:00 2001 From: Ashwin Sekar Date: Wed, 10 May 2017 02:45:03 -0400 Subject: [PATCH] Batch-fy hessian also, construction time is now very fast < 10 ms --- src/kernels/extract.cu | 66 +++++++++++++++- src/kernels/extract.h | 5 +- src/patch.cpp | 22 ++++-- src/patch.h | 3 +- src/patchgrid.cpp | 73 +++++++++++++++++- src/patchgrid.h | 8 ++ src/timing/batch_extract.time | 103 +++++++++++++++++++++++++ src/timing/batch_extract_and_mean.time | 103 +++++++++++++++++++++++++ src/timing/batch_hessian_too.time | 103 +++++++++++++++++++++++++ 9 files changed, 470 insertions(+), 16 deletions(-) create mode 100644 src/timing/batch_extract.time create mode 100644 src/timing/batch_extract_and_mean.time create mode 100644 src/timing/batch_hessian_too.time diff --git a/src/kernels/extract.cu b/src/kernels/extract.cu index 34c1cc8..f6d6506 100644 --- a/src/kernels/extract.cu +++ b/src/kernels/extract.cu @@ -33,9 +33,11 @@ __global__ void kernelExtractPatch( } -__global__ void kernelExtractPatches( +__global__ void kernelExtractPatchesAndHessians( float** patches, float** patchxs, float** patchys, const float * I0, const float * I0x, const float * I0y, + float* H00, float* H01, float* H11, + float** tempXX, float** tempXY, float** tempYY, float* midpointX, float* midpointY, int padding, int patch_size, int width_pad) { @@ -45,6 +47,9 @@ __global__ void kernelExtractPatches( float* patch = patches[patchId]; float* patchX = patchxs[patchId]; float* patchY = patchys[patchId]; + float* XX = tempXX[patchId]; + float* XY = tempXY[patchId]; + float* YY = tempYY[patchId]; int x = round(midpointX[patchId]) + padding; int y = round(midpointY[patchId]) + padding; @@ -57,8 +62,57 @@ __global__ void kernelExtractPatches( patch[i] = I0[j]; patchX[i] = I0x[j]; patchY[i] = I0y[j]; + XX[i] = patchX[i] * patchX[i]; + XY[i] = patchX[i] * patchY[i]; + YY[i] = patchY[i] * patchY[i]; } + __syncthreads(); + + // Mean normalize + __shared__ float mean; + + if (tid == 0) { + + mean = 0.0; + for (int i = 0; i < patch_size * patch_size * 3; i++) { + mean += patch[i]; + } + mean /= patch_size * patch_size * 3; + + } + + __syncthreads(); + + for (int i = tid; i < patch_size * patch_size * 3; + i+= 3 * patch_size) { + patch[i] -= mean; + } + + // TODO: can this be done in parallel? + if (tid == 0) { + + float h00 = 0.0, h01 = 0.0, h11 = 0.0; + + for (int i = 0; i < patch_size * patch_size * 3; i++) { + h00 += XX[i]; + h01 += XY[i]; + h11 += YY[i]; + } + + // If not invertible adjust values + if (h00 * h11 - h01 * h01 == 0) { + h00 += 1e-10; + h11 += 1e-10; + } + + H00[patchId] = h00; + H01[patchId] = h01; + H11[patchId] = h11; + + } + + } @@ -80,17 +134,21 @@ namespace cu { } - void extractPatches(float** patches, float** patchxs, float** patchys, + void extractPatchesAndHessians( + float** patches, float** patchxs, float** patchys, const float * I0, const float * I0x, const float * I0y, + float* H00, float* H01, float* H11, + float** tempXX, float** tempXY, float** tempYY, float* midpointX, float* midpointY, int n_patches, const opt_params* op, const img_params* i_params) { int nBlocks = n_patches; int nThreadsPerBlock = 3 * op->patch_size; - kernelExtractPatches<<>>( + kernelExtractPatchesAndHessians<<>>( patches, patchxs, patchys, - I0, I0x, I0y, midpointX, midpointY, + I0, I0x, I0y, H00, H01, H11, + tempXX, tempXY, tempYY, midpointX, midpointY, i_params->padding, op->patch_size, i_params->width_pad); } diff --git a/src/kernels/extract.h b/src/kernels/extract.h index 0ac8e1c..85c1a5a 100644 --- a/src/kernels/extract.h +++ b/src/kernels/extract.h @@ -32,8 +32,11 @@ namespace cu { const float* I0, const float* I0x, const float* I0y, int patch_offset, int patch_size, int width_pad); - void extractPatches(float** patches, float** patchxs, float** patchys, + void extractPatchesAndHessians( + float** patches, float** patchxs, float** patchys, const float * I0, const float * I0x, const float * I0y, + float* H00, float* H01, float* H11, + float** tempXX, float** tempXY, float** tempYY, float* midpointX, float* midpointY, int n_patches, const opt_params* op, const img_params* i_params); diff --git a/src/patch.cpp b/src/patch.cpp index 0fb67cd..f48da59 100644 --- a/src/patch.cpp +++ b/src/patch.cpp @@ -92,7 +92,7 @@ namespace OFC { // void PatClass::InitializePatch(const float * _I0, // const float * _I0x, const float * _I0y, const Eigen::Vector2f _midpoint) { void PatClass::InitializePatch(float * _patch, - float * _patchx, float * _patchy, + float * _patchx, float * _patchy, float H00, float H01, float H11, const Eigen::Vector2f _midpoint) { // I0 = _I0; @@ -106,14 +106,20 @@ namespace OFC { midpoint = _midpoint; ResetPatchState(); - ExtractPatch(); - ComputeHessian(); + + p_state->hessian(0,0) = H00; + p_state->hessian(0,1) = H01; + p_state->hessian(1,0) = p_state->hessian(0,1); + p_state->hessian(1,1) = H11; + + //ExtractPatch(); + // ComputeHessian(H00, H01, H11); } - void PatClass::ComputeHessian() { + void PatClass::ComputeHessian(float H00, float H01, float H11) { - gettimeofday(&tv_start, nullptr); + /*gettimeofday(&tv_start, nullptr); CUBLAS_CHECK ( cublasSdot(op->cublasHandle, patch.size(), @@ -131,8 +137,12 @@ namespace OFC { hessianTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; - hessianCalls++; + hessianCalls++;*/ + p_state->hessian(0,0) = H00; + p_state->hessian(0,1) = H01; + p_state->hessian(1,0) = p_state->hessian(0,1); + p_state->hessian(1,1) = H11; // If not invertible adjust values if (p_state->hessian.determinant() == 0) { diff --git a/src/patch.h b/src/patch.h index 6442963..269f58d 100644 --- a/src/patch.h +++ b/src/patch.h @@ -48,6 +48,7 @@ namespace OFC { // const float * _I0y, const Eigen::Vector2f _midpoint); void InitializePatch(float * _patch, float * _patchx, float* _patchy, + float H00, float H01, float H11, const Eigen::Vector2f _midpoint); void SetTargetImage(const float * _I1); @@ -75,7 +76,7 @@ namespace OFC { void OptimizeComputeErrImg(); void UpdateMidpoint(); void ResetPatchState(); - void ComputeHessian(); + void ComputeHessian(float H00, float H01, float H11); void ComputeCostErr(); // Extract patch on integer position, and gradients, No Bilinear interpolation diff --git a/src/patchgrid.cpp b/src/patchgrid.cpp index af9260c..47b0363 100644 --- a/src/patchgrid.cpp +++ b/src/patchgrid.cpp @@ -85,7 +85,7 @@ namespace OFC { checkCudaErrors( cudaMalloc ((void**) &pDeviceFlowOut, i_params->width * i_params->height * 2 * sizeof(float)) ); - // Patches + // Patches and Hessians checkCudaErrors( cudaMalloc((void**) &pDevicePatches, n_patches * sizeof(float*)) ); checkCudaErrors( @@ -93,9 +93,21 @@ namespace OFC { checkCudaErrors( cudaMalloc((void**) &pDevicePatchYs, n_patches * sizeof(float*)) ); + checkCudaErrors( + cudaMalloc((void**) &pDeviceTempXX, n_patches * sizeof(float*)) ); + checkCudaErrors( + cudaMalloc((void**) &pDeviceTempXY, n_patches * sizeof(float*)) ); + checkCudaErrors( + cudaMalloc((void**) &pDeviceTempYY, n_patches * sizeof(float*)) ); + pHostDevicePatches = new float*[n_patches]; pHostDevicePatchXs = new float*[n_patches]; pHostDevicePatchYs = new float*[n_patches]; + + float* pHostDeviceTempXX[n_patches]; + float* pHostDeviceTempXY[n_patches]; + float* pHostDeviceTempYY[n_patches]; + for (int i = 0; i < n_patches; i++) { checkCudaErrors( cudaMalloc((void**) &pHostDevicePatches[i], op->n_vals * sizeof(float)) ); @@ -103,6 +115,13 @@ namespace OFC { cudaMalloc((void**) &pHostDevicePatchXs[i], op->n_vals * sizeof(float)) ); checkCudaErrors( cudaMalloc((void**) &pHostDevicePatchYs[i], op->n_vals * sizeof(float)) ); + + checkCudaErrors( + cudaMalloc((void**) &pHostDeviceTempXX[i], op->n_vals * sizeof(float)) ); + checkCudaErrors( + cudaMalloc((void**) &pHostDeviceTempXY[i], op->n_vals * sizeof(float)) ); + checkCudaErrors( + cudaMalloc((void**) &pHostDeviceTempYY[i], op->n_vals * sizeof(float)) ); } checkCudaErrors( cudaMemcpy(pDevicePatches, pHostDevicePatches, @@ -112,6 +131,23 @@ namespace OFC { checkCudaErrors( cudaMemcpy(pDevicePatchYs, pHostDevicePatchYs, n_patches * sizeof(float*), cudaMemcpyHostToDevice) ); + + checkCudaErrors( cudaMemcpy(pDeviceTempXX, pHostDeviceTempXX, + n_patches * sizeof(float*), cudaMemcpyHostToDevice) ); + checkCudaErrors( cudaMemcpy(pDeviceTempXY, pHostDeviceTempXY, + n_patches * sizeof(float*), cudaMemcpyHostToDevice) ); + checkCudaErrors( cudaMemcpy(pDeviceTempYY, pHostDeviceTempYY, + n_patches * sizeof(float*), cudaMemcpyHostToDevice) ); + + // Hessian + H00 = new float[n_patches]; + H01 = new float[n_patches]; + H11 = new float[n_patches]; + + checkCudaErrors( cudaMalloc((void**) &pDeviceH00, n_patches * sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**) &pDeviceH01, n_patches * sizeof(float)) ); + checkCudaErrors( cudaMalloc((void**) &pDeviceH11, n_patches * sizeof(float)) ); + aggregateTime = 0.0; meanTime = 0.0; extractTime = 0.0; @@ -123,6 +159,11 @@ namespace OFC { cudaFree(pDevicePatches[i]); cudaFree(pDevicePatchXs[i]); cudaFree(pDevicePatchYs[i]); + + cudaFree(pDeviceTempXX[i]); + cudaFree(pDeviceTempXY[i]); + cudaFree(pDeviceTempYY[i]); + delete patches[i]; } @@ -139,6 +180,18 @@ namespace OFC { cudaFree(pDeviceMidpointX); cudaFree(pDeviceMidpointY); + cudaFree(pDeviceH00); + cudaFree(pDeviceH01); + cudaFree(pDeviceH11); + + delete H00; + delete H01; + delete H11; + + cudaFree(pDeviceTempXX); + cudaFree(pDeviceTempXY); + cudaFree(pDeviceTempYY); + } void PatGridClass::InitializeGrid(const float * _I0, const float * _I0x, const float * _I0y) { @@ -148,15 +201,27 @@ namespace OFC { I0y = _I0y; gettimeofday(&tv_start, nullptr); - cu::extractPatches(pDevicePatches, pDevicePatchXs, pDevicePatchYs, - I0, I0x, I0y, pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params); + + cu::extractPatchesAndHessians(pDevicePatches, pDevicePatchXs, pDevicePatchYs, + I0, I0x, I0y, pDeviceH00, pDeviceH01, pDeviceH11, + pDeviceTempXX, pDeviceTempXY, pDeviceTempYY, + pDeviceMidpointX, pDeviceMidpointY, n_patches, op, i_params); + + checkCudaErrors( + cudaMemcpy(H00, pDeviceH00, n_patches * sizeof(float), cudaMemcpyDeviceToHost) ); + checkCudaErrors( + cudaMemcpy(H01, pDeviceH01, n_patches * sizeof(float), cudaMemcpyDeviceToHost) ); + checkCudaErrors( + cudaMemcpy(H11, pDeviceH11, n_patches * sizeof(float), cudaMemcpyDeviceToHost) ); + gettimeofday(&tv_end, nullptr); extractTime += (tv_end.tv_sec - tv_start.tv_sec) * 1000.0f + (tv_end.tv_usec - tv_start.tv_usec) / 1000.0f; for (int i = 0; i < n_patches; ++i) { patches[i]->InitializePatch(pHostDevicePatches[i], - pHostDevicePatchXs[i], pHostDevicePatchYs[i], midpoints_ref[i]); + pHostDevicePatchXs[i], pHostDevicePatchYs[i], + H00[i], H01[i], H11[i], midpoints_ref[i]); p_init[i].setZero(); } diff --git a/src/patchgrid.h b/src/patchgrid.h index 1767740..36881f8 100644 --- a/src/patchgrid.h +++ b/src/patchgrid.h @@ -45,10 +45,18 @@ namespace OFC { float* pDeviceWeights, *pDeviceFlowOut; + // Patches float** pDevicePatches, ** pDevicePatchXs, ** pDevicePatchYs; float** pHostDevicePatches, **pHostDevicePatchXs, **pHostDevicePatchYs; float* pDeviceMidpointX, * pDeviceMidpointY; + // Hessian + // TODO: Can we shared memory? + float** pDeviceTempXX, ** pDeviceTempXY, ** pDeviceTempYY; + float* pDeviceH00, * pDeviceH01, * pDeviceH11; + float* H00, * H01, * H11; + + const img_params* i_params; const opt_params* op; diff --git a/src/timing/batch_extract.time b/src/timing/batch_extract.time new file mode 100644 index 0000000..5646e82 --- /dev/null +++ b/src/timing/batch_extract.time @@ -0,0 +1,103 @@ +[start] warmup: processing 720x480 image +[time] 659.243 (ms) : cudaMalloc +[time] 0.162 (ms) : cudaMemcpy H->D +[time] 263.46 (ms) : warmup +[time] 0.099 (ms) : cudaMemcpy H<-D +[done] warmup: primary compute time: 263.46 (ms) +[time] 0.476 (ms) : I0, I1 cudaMalloc +[time] 0.991 (ms) : cudaMemcpy I0, I1 H->D +[start] pad: processing 1024x436 image +[time] 0.212 (ms) : cudaMalloc +[time] 0.023 (ms) : pad +[done] pad: primary compute time: 0.023 (ms) +[start] pad: processing 1024x436 image +[time] 0.211 (ms) : cudaMalloc +[time] 0.01 (ms) : pad +[done] pad: primary compute time: 0.01 (ms) +TIME (Image loading ) (ms): 27.635 +TIME (Grid Memo. Alloc. ) (ms): 39.898 +TIME (Setup) (ms): 219.219 +I0 448x1024 +Constructing pyramids +[start] constructImgPyramids: processing 1024x448 image +[time] 1.73 (ms) : sobel: Ixs[0] +[time] 0.283 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.278 (ms) : sobel: Ixs[i] +[time] 0.289 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.275 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.28 (ms) : sobel: Ixs[i] +[time] 0.28 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.276 (ms) : sobel: Ixs[i] +[time] 0.277 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.289 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +[time] 4.987 (ms) : total time +[done] constructImgPyramids: primmary compute time: 4.815 +[start] constructImgPyramids: processing 1024x448 image +[time] 0.273 (ms) : sobel: Ixs[0] +[time] 0.284 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.277 (ms) : sobel: Ixs[i] +[time] 0.278 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.289 (ms) : sobel: Ixs[i] +[time] 0.302 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.296 (ms) : sobel: Ixs[i] +[time] 0.297 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.297 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.275 (ms) : sobel: Ixs[i] +[time] 0.285 (ms) : sobel: Iys[i] +[time] 3.561 (ms) : total time +[done] constructImgPyramids: primmary compute time: 3.432 +TIME (Pyramids+Gradients) (ms): 8.56 +, cflow +TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 2.21 0.00 33.41 0.14 0.29 -> 36.05 ms. +TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 7.45 0.00 116.82 0.39 0.74 -> 125.40 ms. +TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 29.80 0.00 444.32 1.42 2.14 -> 477.67 ms. +TIME (O.Flow Run-Time ) (ms): 639.141 + +===============Timings (ms)=============== +Avg grad descent iterations: 12.9933 +[hessian] 20 tot => 0.0446429 avg +[project] 155.165 tot => 0.0288787 avg +[cost] 123.778 tot => 0.021264 avg +[interpolate] 41.283 tot => 0.00709208 avg +[mean norm] 119.738 tot => 0.0191 avg +[extract] 0.007 +[aggregate] 1.382 +[flow norm] 0.037 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 4.982 tot => 0.0444821 avg +[project] 40.511 tot => 0.0301421 avg +[cost] 32.868 tot => 0.0225742 avg +[interpolate] 10.571 tot => 0.0072603 avg +[mean norm] 31.73 tot => 0.020236 avg +[extract] 0.007 +[aggregate] 0.378 +[flow norm] 0.015 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 1.44 tot => 0.045 avg +[project] 11.583 tot => 0.0301641 avg +[cost] 9.426 tot => 0.0226587 avg +[interpolate] 3.026 tot => 0.00727404 avg +[mean norm] 9.069 tot => 0.0202433 avg +[extract] 0.023 +[aggregate] 0.119 +[flow norm] 0.014 +========================================== diff --git a/src/timing/batch_extract_and_mean.time b/src/timing/batch_extract_and_mean.time new file mode 100644 index 0000000..496accb --- /dev/null +++ b/src/timing/batch_extract_and_mean.time @@ -0,0 +1,103 @@ +[start] warmup: processing 720x480 image +[time] 655.99 (ms) : cudaMalloc +[time] 0.106 (ms) : cudaMemcpy H->D +[time] 213.393 (ms) : warmup +[time] 0.084 (ms) : cudaMemcpy H<-D +[done] warmup: primary compute time: 213.393 (ms) +[time] 0.433 (ms) : I0, I1 cudaMalloc +[time] 1.008 (ms) : cudaMemcpy I0, I1 H->D +[start] pad: processing 1024x436 image +[time] 0.199 (ms) : cudaMalloc +[time] 0.019 (ms) : pad +[done] pad: primary compute time: 0.019 (ms) +[start] pad: processing 1024x436 image +[time] 0.195 (ms) : cudaMalloc +[time] 0.009 (ms) : pad +[done] pad: primary compute time: 0.009 (ms) +TIME (Image loading ) (ms): 24.99 +TIME (Grid Memo. Alloc. ) (ms): 36.493 +TIME (Setup) (ms): 195.469 +I0 448x1024 +Constructing pyramids +[start] constructImgPyramids: processing 1024x448 image +[time] 1.675 (ms) : sobel: Ixs[0] +[time] 0.296 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.274 (ms) : sobel: Ixs[i] +[time] 0.266 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.277 (ms) : sobel: Ixs[i] +[time] 0.267 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.265 (ms) : sobel: Ixs[i] +[time] 0.271 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.272 (ms) : sobel: Ixs[i] +[time] 0.27 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.268 (ms) : sobel: Ixs[i] +[time] 0.269 (ms) : sobel: Iys[i] +[time] 4.837 (ms) : total time +[done] constructImgPyramids: primmary compute time: 4.67 +[start] constructImgPyramids: processing 1024x448 image +[time] 0.264 (ms) : sobel: Ixs[0] +[time] 0.264 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.265 (ms) : sobel: Ixs[i] +[time] 0.279 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.27 (ms) : sobel: Ixs[i] +[time] 0.274 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.27 (ms) : sobel: Ixs[i] +[time] 0.267 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.265 (ms) : sobel: Ixs[i] +[time] 0.265 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.271 (ms) : sobel: Ixs[i] +[time] 0.264 (ms) : sobel: Iys[i] +[time] 3.344 (ms) : total time +[done] constructImgPyramids: primmary compute time: 3.218 +TIME (Pyramids+Gradients) (ms): 8.191 +, cflow +TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 1.53 0.00 31.91 0.13 0.25 -> 33.81 ms. +TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 4.96 0.00 110.87 0.35 0.65 -> 116.84 ms. +TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 18.77 0.00 424.14 1.33 2.10 -> 446.35 ms. +TIME (O.Flow Run-Time ) (ms): 597.018 + +===============Timings (ms)=============== +Avg grad descent iterations: 12.9933 +[hessian] 18.47 tot => 0.0412277 avg +[project] 149.086 tot => 0.0277473 avg +[cost] 118.145 tot => 0.0202963 avg +[interpolate] 37.841 tot => 0.00650077 avg +[mean norm] 105.728 tot => 0.0181632 avg +[extract] 0.007 +[aggregate] 1.288 +[flow norm] 0.038 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 4.879 tot => 0.0435625 avg +[project] 39.264 tot => 0.0292143 avg +[cost] 30.855 tot => 0.0211916 avg +[interpolate] 9.481 tot => 0.00651168 avg +[mean norm] 27.972 tot => 0.0192115 avg +[extract] 0.007 +[aggregate] 0.327 +[flow norm] 0.02 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 1.487 tot => 0.0464688 avg +[project] 11.26 tot => 0.0293229 avg +[cost] 8.915 tot => 0.0214303 avg +[interpolate] 2.723 tot => 0.00654567 avg +[mean norm] 8.038 tot => 0.0193221 avg +[extract] 0.02 +[aggregate] 0.111 +[flow norm] 0.014 +========================================== diff --git a/src/timing/batch_hessian_too.time b/src/timing/batch_hessian_too.time new file mode 100644 index 0000000..cda9202 --- /dev/null +++ b/src/timing/batch_hessian_too.time @@ -0,0 +1,103 @@ +[start] warmup: processing 720x480 image +[time] 665.115 (ms) : cudaMalloc +[time] 0.11 (ms) : cudaMemcpy H->D +[time] 216.189 (ms) : warmup +[time] 0.221 (ms) : cudaMemcpy H<-D +[done] warmup: primary compute time: 216.189 (ms) +[time] 0.424 (ms) : I0, I1 cudaMalloc +[time] 0.993 (ms) : cudaMemcpy I0, I1 H->D +[start] pad: processing 1024x436 image +[time] 0.219 (ms) : cudaMalloc +[time] 0.026 (ms) : pad +[done] pad: primary compute time: 0.026 (ms) +[start] pad: processing 1024x436 image +[time] 0.202 (ms) : cudaMalloc +[time] 0.012 (ms) : pad +[done] pad: primary compute time: 0.012 (ms) +TIME (Image loading ) (ms): 25.133 +TIME (Grid Memo. Alloc. ) (ms): 47.809 +TIME (Setup) (ms): 207.694 +I0 448x1024 +Constructing pyramids +[start] constructImgPyramids: processing 1024x448 image +[time] 1.728 (ms) : sobel: Ixs[0] +[time] 0.327 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.279 (ms) : sobel: Ixs[i] +[time] 0.276 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.281 (ms) : sobel: Ixs[i] +[time] 0.268 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.268 (ms) : sobel: Ixs[i] +[time] 0.273 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.267 (ms) : sobel: Ixs[i] +[time] 0.28 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.268 (ms) : sobel: Ixs[i] +[time] 0.275 (ms) : sobel: Iys[i] +[time] 4.991 (ms) : total time +[done] constructImgPyramids: primmary compute time: 4.79 +[start] constructImgPyramids: processing 1024x448 image +[time] 0.265 (ms) : sobel: Ixs[0] +[time] 0.267 (ms) : sobel: Iys[0] +constructImgPyramids level 1: 512x224 +[time] 0.272 (ms) : sobel: Ixs[i] +[time] 0.271 (ms) : sobel: Iys[i] +constructImgPyramids level 2: 256x112 +[time] 0.266 (ms) : sobel: Ixs[i] +[time] 0.266 (ms) : sobel: Iys[i] +constructImgPyramids level 3: 128x56 +[time] 0.272 (ms) : sobel: Ixs[i] +[time] 0.265 (ms) : sobel: Iys[i] +constructImgPyramids level 4: 64x28 +[time] 0.266 (ms) : sobel: Ixs[i] +[time] 0.266 (ms) : sobel: Iys[i] +constructImgPyramids level 5: 32x14 +[time] 0.274 (ms) : sobel: Ixs[i] +[time] 0.265 (ms) : sobel: Iys[i] +[time] 3.341 (ms) : total time +[done] constructImgPyramids: primmary compute time: 3.215 +TIME (Pyramids+Gradients) (ms): 8.342 +, cflow +TIME (Sc: 5, #p: 32, pconst, pinit, poptim, cflow, tvopt, total): 0.08 0.00 32.09 0.12 0.26 -> 32.54 ms. +TIME (Sc: 4, #p: 112, pconst, pinit, poptim, cflow, tvopt, total): 0.07 0.00 112.14 0.35 0.66 -> 113.21 ms. +TIME (Sc: 3, #p: 448, pconst, pinit, poptim, cflow, tvopt, total): 0.10 0.00 436.45 1.41 2.18 -> 440.14 ms. +TIME (O.Flow Run-Time ) (ms): 585.922 + +===============Timings (ms)=============== +Avg grad descent iterations: 12.9933 +[hessian] 0 tot => -nan avg +[project] 153.341 tot => 0.0285392 avg +[cost] 122.301 tot => 0.0210103 avg +[interpolate] 38.586 tot => 0.00662876 avg +[mean norm] 108.655 tot => 0.018666 avg +[extract] 0.074 +[aggregate] 1.372 +[flow norm] 0.04 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 0 tot => -nan avg +[project] 39.555 tot => 0.0294308 avg +[cost] 31.504 tot => 0.0216374 avg +[interpolate] 9.531 tot => 0.00654602 avg +[mean norm] 28.148 tot => 0.0193324 avg +[extract] 0.056 +[aggregate] 0.328 +[flow norm] 0.017 +========================================== + +===============Timings (ms)=============== +Avg grad descent iterations: 13 +[hessian] 0 tot => -nan avg +[project] 11.309 tot => 0.0294505 avg +[cost] 8.998 tot => 0.0216298 avg +[interpolate] 2.746 tot => 0.00660096 avg +[mean norm] 8.051 tot => 0.0193534 avg +[extract] 0.073 +[aggregate] 0.108 +[flow norm] 0.013 +==========================================