Skip to content

Commit

Permalink
Merge branch 'pad' into pyramid
Browse files Browse the repository at this point in the history
Conflicts:
	src/CMakeLists.txt
  • Loading branch information
Richard Zhao committed May 4, 2017
2 parents eb3bbf2 + 90c256e commit 5ac4469
Show file tree
Hide file tree
Showing 6 changed files with 280 additions and 22 deletions.
4 changes: 3 additions & 1 deletion src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ set(KERNELS
kernels/warmup.cpp
kernels/sobel.cpp
kernels/pyramid.cpp
kernels/pad.cpp
kernels/resizeGrad.cpp
kernels/resize.cpp)

Expand All @@ -70,9 +71,10 @@ target_link_libraries(flow ${OpenCV_LIBS})

# CUDA sandbox
set(SANDBOX_FILES
sandbox/process_sobel.cpp
# sandbox/process_sobel.cpp
# sandbox/process_resize.cpp
# sandbox/process_resizeGrad.cpp
sandbox/process_pad.cpp
# sandbox/RgbMatTest.cpp
sandbox/sandbox.cpp)
cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
Expand Down
133 changes: 133 additions & 0 deletions src/kernels/pad.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
/**
* Implements a pad kernel
*/

// System
#include <iostream>
#include <chrono>
#include <string>
#include <stdexcept>

// OpenCV
#include <opencv2/opencv.hpp>

// CUDA
#include <cuda.h>
#include <cuda_runtime.h>

// NVIDIA Perf Primitives
#include <nppi.h>
#include <nppi_filtering_functions.h>

// Local
#include "../common/timer.h"

#include "pad.h"

using namespace timer;

namespace cu {

/**
* Perform border padding with constant (0) or replication on src and store it in dest.
* Accepts 3-channel 32-bit float matrices.
*
* Params:
* src input image.
* dst output image;
* it has size of src + padding
* top top padding
* bottom bottom padding
* left left padding
* right right padding
* replicate whether to replicate or constant
*/
void pad(
const cv::Mat& src, cv::Mat& dest, int top,
int bottom, int left, int right, bool replicate) {

if (src.type() != CV_32FC3) {
throw std::invalid_argument("pad: invalid input matrix type");
}

// Compute time of relevant kernel
double compute_time = 0.0;

// CV_32FC3 is made up of RGB floats
int channels = 3;
size_t elemSize = 3 * sizeof(float);

cv::Size sz = src.size();
int width = sz.width;
int height = sz.height;
int destWidth = left + width + right;
int destHeight = top + height + bottom;

std::cout << "[start] pad: processing " << width << "x" << height << " image" << std::endl;

// pSrc pointer to image data
Npp32f* pHostSrc = (float*) src.data;

// The width, in bytes, of the image, sometimes referred to as pitch
unsigned int nSrcStep = width * elemSize;
unsigned int nDstStep = destWidth * elemSize;

NppiSize oSrcSizeROI = { width, height };
// NppiSize oDestSizeROI = { left + width + right, top + height + bottom};
NppiSize oDstSizeROI = { destWidth, destHeight };
const Npp32f padVal[3] = {0.0, 0.0, 0.0};

auto start_cuda_malloc = now();

// Allocate device memory
Npp32f* pDeviceSrc, *pDeviceDst;

checkCudaErrors( cudaMalloc((void**) &pDeviceSrc, width * height * elemSize) );
checkCudaErrors( cudaMalloc((void**) &pDeviceDst, destWidth * destHeight * elemSize) );
checkCudaErrors( cudaMemset(pDeviceDst, 0, destWidth * destHeight * elemSize) );

calc_print_elapsed("cudaMalloc", start_cuda_malloc);


auto start_memcpy_hd = now();

// Copy image to device
checkCudaErrors(
cudaMemcpy(pDeviceSrc, pHostSrc, width * height * elemSize, cudaMemcpyHostToDevice) );

calc_print_elapsed("cudaMemcpy H->D", start_memcpy_hd);


auto start_pad = now();

NPP_CHECK_NPP(

(replicate)
? nppiCopyReplicateBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSizeROI, pDeviceDst,
nDstStep, oDstSizeROI, top, left)
: nppiCopyConstBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSizeROI, pDeviceDst,
nDstStep, oDstSizeROI, top, left, padVal)

);

compute_time += calc_print_elapsed("pad", start_pad);


auto start_memcpy_dh = now();

// Copy result to host
dest.create(destHeight, destWidth, CV_32FC3);

checkCudaErrors(
cudaMemcpy(dest.data, pDeviceDst,
destWidth * destHeight * elemSize, cudaMemcpyDeviceToHost) );

calc_print_elapsed("cudaMemcpy H<-D", start_memcpy_dh);

cudaFree((void*) pDeviceSrc);
cudaFree((void*) pDeviceDst);

std::cout << "[done] pad: primary compute time: " << compute_time << " (ms)" << std::endl;
}

}
52 changes: 52 additions & 0 deletions src/kernels/pad.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
/**
* Implements a pad kernel
*/

#ifndef __KERNEL_PAD_H__
#define __KERNEL_PAD_H__

// System
#include <iostream>
#include <chrono>
#include <string>
#include <stdexcept>

// OpenCV
#include <opencv2/opencv.hpp>

// CUDA
#include <cuda.h>
#include <cuda_runtime.h>

// NVIDIA Perf Primitives
#include <nppi.h>
#include <nppi_filtering_functions.h>

// Local
#include "../common/Exceptions.h"
#include "../common/timer.h"
#include "../sandbox/process.h"

namespace cu {

/**
* Perform border padding with constant (0) or replication on src and store it in dest.
* Accepts 3-channel 32-bit float matrices.
*
* Params:
* src input image.
* dst output image;
* it has size of src + padding
* top top padding
* bottom bottom padding
* left left padding
* right right padding
* replicate whether to replicate or constant pad
*/
void pad(
const cv::Mat& src, cv::Mat& dest, int top,
int bottom, int left, int right, bool replicate);

}

#endif // end __KERNEL_PAD_H__
25 changes: 13 additions & 12 deletions src/oflow.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#include "refine_variational.h"

#include "kernels/resize.h"
#include "kernels/pad.h"
#include "kernels/resizeGrad.h"
#include "kernels/sobel.h"
#include "kernels/pyramid.h"
Expand Down Expand Up @@ -86,22 +87,22 @@ namespace OFC {
for (int i = 0; i <= op.coarsest_scale; ++i) {

// Replicate padding for images
copyMakeBorder(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_REPLICATE);
copyMakeBorder(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_REPLICATE);
cu::pad(I0_mats[i], I0_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, true);
cu::pad(I1_mats[i], I1_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, true);
I0s[i] = (float*) I0_mats[i].data;
I1s[i] = (float*) I1_mats[i].data;

// Zero pad for gradients
copyMakeBorder(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
copyMakeBorder(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
copyMakeBorder(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
copyMakeBorder(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, cv::BORDER_CONSTANT, 0);
cu::pad(I0x_mats[i], I0x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I0y_mats[i], I0y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I1x_mats[i], I1x_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);
cu::pad(I1y_mats[i], I1y_mats[i], op.patch_size, op.patch_size,
op.patch_size, op.patch_size, false);

I0xs[i] = (float*) I0x_mats[i].data;
I0ys[i] = (float*) I0y_mats[i].data;
Expand Down
20 changes: 11 additions & 9 deletions src/run_dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

#include "oflow.h"
#include "kernels/warmup.h"
#include "kernels/pad.h"

// CUDA
#include <cuda_runtime.h>
Expand Down Expand Up @@ -195,6 +196,10 @@ int main( int argc, char** argv ) {
}


// convert to float
I0_mat.convertTo(I0_fmat, CV_32F);
I1_mat.convertTo(I1_fmat, CV_32F);

// Pad image such that width and height are restless divisible on all scales (except last)
int padw = 0, padh = 0;
int max_scale = pow(2, op.coarsest_scale); // enforce restless division by this number on coarsest scale
Expand All @@ -205,24 +210,21 @@ int main( int argc, char** argv ) {

if (padh > 0 || padw > 0) {

copyMakeBorder(I0_mat, I0_mat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
floor((float) padw / 2.0f), ceil((float) padw / 2.0f), cv::BORDER_REPLICATE);
copyMakeBorder(I1_mat, I1_mat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
floor((float) padw / 2.0f), ceil((float) padw / 2.0f), cv::BORDER_REPLICATE);
cu::pad(I0_fmat, I0_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);
cu::pad(I1_fmat, I1_fmat, floor((float) padh / 2.0f), ceil((float) padh / 2.0f),
floor((float) padw / 2.0f), ceil((float) padw / 2.0f), true);

}

// Create image paramaters
img_params iparams;

// padded image size, ensures divisibility by 2 on all scales (except last)
iparams.width = I0_mat.size().width;
iparams.height = I0_mat.size().height;
iparams.width = I0_fmat.size().width;
iparams.height = I0_fmat.size().height;
iparams.padding = op.patch_size;

// convert to float
I0_mat.convertTo(I0_fmat, CV_32F);
I1_mat.convertTo(I1_fmat, CV_32F);

// Timing, image loading
if (op.verbosity > 1) {
Expand Down
68 changes: 68 additions & 0 deletions src/sandbox/process_pad.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
/**
* Test a pad kernel
*/

// System
#include <iostream>
#include <chrono>
#include <string>
#include <stdexcept>

// OpenCV
#include <opencv2/opencv.hpp>

// CUDA
#include <cuda.h>
#include <cuda_runtime.h>

// NVIDIA Perf Primitives
#include <nppi.h>
#include <nppi_filtering_functions.h>

// Local
#include "../common/timer.h"
#include "process.h"

#include "../kernels/pad.h"

using namespace timer;

void process(const char* input_file, const char* output_file) {
cv::Mat I0, I0_f, I1_f;

auto start_read = now();

// Get input
I0 = cv::imread(input_file, CV_LOAD_IMAGE_COLOR);

calc_print_elapsed("imread", start_read);

// Check for invalid input
if(!I0.data) {
std::cout << "Could not open or find the image" << std::endl ;
exit(1);
}

auto start_convert = now();

// Convert to float
I0.convertTo(I0_f, CV_32FC3);

calc_print_elapsed("convertTo float", start_convert);


auto start_resize = now();

cu::pad(I0_f, I1_f, 20, 50, 100, 200, true);

calc_print_elapsed("resize", start_resize);


auto start_write = now();

// Write output
cv::imwrite(output_file, I1_f);

calc_print_elapsed("write", start_write);
}

0 comments on commit 5ac4469

Please sign in to comment.