From e9c83cdc7af4823d42c5cb0273e05f225e26ac02 Mon Sep 17 00:00:00 2001 From: Richard Zhao Date: Wed, 10 May 2017 21:38:33 +0000 Subject: [PATCH] Add unified memory setup --- src/CMakeLists.txt | 24 ++--- src/common/cuda_helper.h | 7 ++ src/kernels/sobelUnifiedMem.cpp | 164 ++++++++++++++++++++++++++++++++ src/run_dense.cpp | 3 + 4 files changed, 187 insertions(+), 11 deletions(-) create mode 100644 src/kernels/sobelUnifiedMem.cpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index d43564f..a7c7cd5 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -80,7 +80,8 @@ set(KERNELS kernels/interpolate.cu kernels/extract.cu kernels/warmup.cpp - kernels/sobel.cpp + # kernels/sobel.cpp + kernels/sobelUnifiedMem.cpp kernels/pyramid.cpp kernels/pad.cpp kernels/resizeGrad.cpp @@ -109,13 +110,14 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") # # set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD target_link_libraries(flow ${OpenCV_LIBS}) -# # CUDA sandbox -# set(SANDBOX_FILES -# # sandbox/process_sobel.cpp -# sandbox/process_resize.cpp -# # sandbox/process_resizeGrad.cpp -# # sandbox/process_pad.cpp -# # sandbox/RgbMatTest.cpp -# sandbox/sandbox.cpp) -# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) -# target_link_libraries(sandbox ${OpenCV_LIBS}) +# CUDA sandbox +set(SANDBOX_FILES + # sandbox/process_sobel.cpp + sandbox/process_sobel.cpp + # sandbox/process_resize.cpp + # sandbox/process_resizeGrad.cpp + # sandbox/process_pad.cpp + # sandbox/RgbMatTest.cpp + sandbox/sandbox.cpp) +cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES}) +target_link_libraries(sandbox ${OpenCV_LIBS}) diff --git a/src/common/cuda_helper.h b/src/common/cuda_helper.h index ebd51a1..0cc8e14 100644 --- a/src/common/cuda_helper.h +++ b/src/common/cuda_helper.h @@ -517,6 +517,13 @@ inline void initializeCuda(int argc, char** argv) { //Get GPU information checkCudaErrors(cudaGetDevice(&devID)); checkCudaErrors(cudaGetDeviceProperties(&props, devID)); + + if (props.canMapHostMemory) { + if (props.integrated) + printf("Device is an integrated GPU\n"); + printf("Device supports zero copy, enabling page locked memory mapping\n\n"); + checkCudaErrors( cudaSetDeviceFlags(cudaDeviceMapHost) ); + } } #endif // end __CUDA_HELPER_H__ diff --git a/src/kernels/sobelUnifiedMem.cpp b/src/kernels/sobelUnifiedMem.cpp new file mode 100644 index 0000000..dece9de --- /dev/null +++ b/src/kernels/sobelUnifiedMem.cpp @@ -0,0 +1,164 @@ +/** + * Implements a sobel kernel + */ + +// System +#include +#include +#include +#include +#include + +// OpenCV +#include + +// CUDA +#include +#include + +// NVIDIA Perf Primitives +#include +#include + +// Local +#include "../common/timer.h" + +#include "sobel.h" + +using namespace timer; + +namespace cu { + + /** + * Perform a sobel filter on src and store it in dest. + * dest must be allocated. + * Accepts 3-channel 32-bit float matrices. + * + * Params: + * src input image. + * dst output image of the same size and the same number of channels as src . + * ddepth output image depth. + * dx order of the derivative x. Only 0, 1 supported. + * dy order of the derivative y. Only 0, 1 supported. + * ksize (unused) size of the extended Sobel kernel; it must be 1, 3, 5, or 7. + * scale (unused) optional scale factor for the computed derivative values + * delta (optional|unused) delta value that is added to the results prior to storing them in dst. + * borderType (unused) pixel extrapolation method, see cv::BorderTypes + */ + void sobel( + const cv::Mat& src, cv::Mat& dest, int ddepth, int dx, int dy, + int ksize, double scale, double delta, int borderType) { + + if (src.type() != CV_32FC3) { + throw std::invalid_argument("sobel: invalid input matrix type"); + } + + if ( !((dx == 1 && dy == 0) || + (dx == 0 && dy == 1)) ) { + throw std::invalid_argument("sobel: only accepts first order derivatives"); + } + + // Compute time of relevant kernel + double compute_time = 0.0; + double total_time = 0.0; + + // CV_32FC3 is made up of RGB floats + int channels = 3; + size_t elemSize = 3 * sizeof(float); + + cv::Size sz = src.size(); + int width = sz.width; + int height = sz.height; + + std::cout << "[start] sobel: processing " << width << "x" << height << " image" << std::endl; + + // pSrc pointer to image data + Npp32f* pHostSrc = (float*) src.data; + + // The width, in bytes, of the image, sometimes referred to as pitch + unsigned int nSrcStep = width * elemSize; + unsigned int nDstStep = nSrcStep; + + NppiSize oSrcSize = { width, height }; + NppiPoint oSrcOffset = { 0, 0 }; + NppiSize oSizeROI = { width, height }; + + // For 1D convolution + const Npp32f pKernel[3] = { 1, 0, -1 }; + Npp32s nMaskSize = 3; + Npp32s nAnchor = 1; // Kernel is centered over pixel + NppiBorderType eBorderType = NPP_BORDER_REPLICATE; + + auto start_cuda_malloc = now(); + + // Allocate device memory + Npp32f* pDeviceSrc, *pDeviceDst; + + checkCudaErrors( cudaHostAlloc((void**) &pDeviceSrc, width * height * elemSize, cudaHostAllocMapped) ); + checkCudaErrors( cudaHostAlloc((void**) &pDeviceDst, width * height * elemSize, cudaHostAllocMapped) ); + + // For custom row/col kernel + Npp32f* pDeviceKernel; + checkCudaErrors( cudaHostAlloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f), cudaHostAllocMapped) ); + + calc_print_elapsed("cudaHostAlloc", start_cuda_malloc); + + + auto start_memcpy_hd = now(); + + std::memcpy(pDeviceSrc, pHostSrc, width * height * elemSize); + std::memcpy(pDeviceKernel, pKernel, nMaskSize * sizeof(Npp32f)); + + calc_print_elapsed("std::memcpy D->H", start_memcpy_hd); + + + bool useHoriz = (dx == 1); + + auto start_sobel = now(); + + NPP_CHECK_NPP( + (useHoriz) + // For built in sobel + // ? nppiFilterPrewittHorizBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType) + // : nppiFilterPrewittVertBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType) + + // Custom row filter + ? nppiFilterRowBorder_32f_C3R ( + pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, + pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType) + + : nppiFilterColumnBorder_32f_C3R ( + pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, + pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType) + + // Sobel with mask + // ? nppiFilterSobelHorizMaskBorder_32f_C1R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType) + // : nppiFilterSobelVertMaskBorder_32f_C1R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType) + ); + + compute_time += calc_print_elapsed("sobel", start_sobel); + + + auto start_memcpy_dh = now(); + + // Copy result to host + dest.create(height, width, CV_32FC3); + float* pHostDst = (float*) dest.data; + + std::memcpy(pHostDst, pDeviceDst, width * height * elemSize); + + total_time += calc_print_elapsed("std::memcpy H<-D", start_memcpy_dh); + + cudaFree((void*) pDeviceSrc); + cudaFree((void*) pDeviceDst); + + // Only for custom row/col filter + cudaFree((void*) pDeviceKernel); + + std::cout << "[done] sobel" << std::endl; + std::cout << " primary compute time: " << compute_time << " (ms)" << std::endl; + std::cout << " total compute time: " << compute_time + total_time << " (ms)" << std::endl; + } + +} + diff --git a/src/run_dense.cpp b/src/run_dense.cpp index b307080..ac3bdd5 100644 --- a/src/run_dense.cpp +++ b/src/run_dense.cpp @@ -15,6 +15,7 @@ #include "kernels/warmup.h" #include "kernels/pad.h" #include "common/timer.h" +#include "common/cuda_helper.h" using namespace std; @@ -113,6 +114,8 @@ int AutoFirstScaleSelect(int imgwidth, int fratio, int patchsize) { int main( int argc, char** argv ) { + initializeCuda(argc, argv); + // Warmup GPU cu::warmup();