Skip to content

Commit

Permalink
Add unified memory setup
Browse files Browse the repository at this point in the history
  • Loading branch information
Richard Zhao committed May 10, 2017
1 parent 11e7b62 commit e9c83cd
Show file tree
Hide file tree
Showing 4 changed files with 187 additions and 11 deletions.
24 changes: 13 additions & 11 deletions src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,8 @@ set(KERNELS
kernels/interpolate.cu
kernels/extract.cu
kernels/warmup.cpp
kernels/sobel.cpp
# kernels/sobel.cpp
kernels/sobelUnifiedMem.cpp
kernels/pyramid.cpp
kernels/pad.cpp
kernels/resizeGrad.cpp
Expand Down Expand Up @@ -109,13 +110,14 @@ set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=4") #
# set_property(TARGET flow APPEND PROPERTY COMPILE_DEFINITIONS "VECTOR_WIDTH=1") # no SIMD
target_link_libraries(flow ${OpenCV_LIBS})

# # CUDA sandbox
# set(SANDBOX_FILES
# # sandbox/process_sobel.cpp
# sandbox/process_resize.cpp
# # sandbox/process_resizeGrad.cpp
# # sandbox/process_pad.cpp
# # sandbox/RgbMatTest.cpp
# sandbox/sandbox.cpp)
# cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
# target_link_libraries(sandbox ${OpenCV_LIBS})
# CUDA sandbox
set(SANDBOX_FILES
# sandbox/process_sobel.cpp
sandbox/process_sobel.cpp
# sandbox/process_resize.cpp
# sandbox/process_resizeGrad.cpp
# sandbox/process_pad.cpp
# sandbox/RgbMatTest.cpp
sandbox/sandbox.cpp)
cuda_add_executable(sandbox ${COMMON} ${KERNELS} ${SANDBOX_FILES})
target_link_libraries(sandbox ${OpenCV_LIBS})
7 changes: 7 additions & 0 deletions src/common/cuda_helper.h
Original file line number Diff line number Diff line change
Expand Up @@ -517,6 +517,13 @@ inline void initializeCuda(int argc, char** argv) {
//Get GPU information
checkCudaErrors(cudaGetDevice(&devID));
checkCudaErrors(cudaGetDeviceProperties(&props, devID));

if (props.canMapHostMemory) {
if (props.integrated)
printf("Device is an integrated GPU\n");
printf("Device supports zero copy, enabling page locked memory mapping\n\n");
checkCudaErrors( cudaSetDeviceFlags(cudaDeviceMapHost) );
}
}

#endif // end __CUDA_HELPER_H__
164 changes: 164 additions & 0 deletions src/kernels/sobelUnifiedMem.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,164 @@
/**
* Implements a sobel kernel
*/

// System
#include <iostream>
#include <chrono>
#include <string>
#include <stdexcept>
#include <cstring>

// OpenCV
#include <opencv2/opencv.hpp>

// CUDA
#include <cuda.h>
#include <cuda_runtime.h>

// NVIDIA Perf Primitives
#include <nppi.h>
#include <nppi_filtering_functions.h>

// Local
#include "../common/timer.h"

#include "sobel.h"

using namespace timer;

namespace cu {

/**
* Perform a sobel filter on src and store it in dest.
* dest must be allocated.
* Accepts 3-channel 32-bit float matrices.
*
* Params:
* src input image.
* dst output image of the same size and the same number of channels as src .
* ddepth output image depth.
* dx order of the derivative x. Only 0, 1 supported.
* dy order of the derivative y. Only 0, 1 supported.
* ksize (unused) size of the extended Sobel kernel; it must be 1, 3, 5, or 7.
* scale (unused) optional scale factor for the computed derivative values
* delta (optional|unused) delta value that is added to the results prior to storing them in dst.
* borderType (unused) pixel extrapolation method, see cv::BorderTypes
*/
void sobel(
const cv::Mat& src, cv::Mat& dest, int ddepth, int dx, int dy,
int ksize, double scale, double delta, int borderType) {

if (src.type() != CV_32FC3) {
throw std::invalid_argument("sobel: invalid input matrix type");
}

if ( !((dx == 1 && dy == 0) ||
(dx == 0 && dy == 1)) ) {
throw std::invalid_argument("sobel: only accepts first order derivatives");
}

// Compute time of relevant kernel
double compute_time = 0.0;
double total_time = 0.0;

// CV_32FC3 is made up of RGB floats
int channels = 3;
size_t elemSize = 3 * sizeof(float);

cv::Size sz = src.size();
int width = sz.width;
int height = sz.height;

std::cout << "[start] sobel: processing " << width << "x" << height << " image" << std::endl;

// pSrc pointer to image data
Npp32f* pHostSrc = (float*) src.data;

// The width, in bytes, of the image, sometimes referred to as pitch
unsigned int nSrcStep = width * elemSize;
unsigned int nDstStep = nSrcStep;

NppiSize oSrcSize = { width, height };
NppiPoint oSrcOffset = { 0, 0 };
NppiSize oSizeROI = { width, height };

// For 1D convolution
const Npp32f pKernel[3] = { 1, 0, -1 };
Npp32s nMaskSize = 3;
Npp32s nAnchor = 1; // Kernel is centered over pixel
NppiBorderType eBorderType = NPP_BORDER_REPLICATE;

auto start_cuda_malloc = now();

// Allocate device memory
Npp32f* pDeviceSrc, *pDeviceDst;

checkCudaErrors( cudaHostAlloc((void**) &pDeviceSrc, width * height * elemSize, cudaHostAllocMapped) );
checkCudaErrors( cudaHostAlloc((void**) &pDeviceDst, width * height * elemSize, cudaHostAllocMapped) );

// For custom row/col kernel
Npp32f* pDeviceKernel;
checkCudaErrors( cudaHostAlloc((void**) &pDeviceKernel, nMaskSize * sizeof(Npp32f), cudaHostAllocMapped) );

calc_print_elapsed("cudaHostAlloc", start_cuda_malloc);


auto start_memcpy_hd = now();

std::memcpy(pDeviceSrc, pHostSrc, width * height * elemSize);
std::memcpy(pDeviceKernel, pKernel, nMaskSize * sizeof(Npp32f));

calc_print_elapsed("std::memcpy D->H", start_memcpy_hd);


bool useHoriz = (dx == 1);

auto start_sobel = now();

NPP_CHECK_NPP(
(useHoriz)
// For built in sobel
// ? nppiFilterPrewittHorizBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType)
// : nppiFilterPrewittVertBorder_32f_C3R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, eBorderType)

// Custom row filter
? nppiFilterRowBorder_32f_C3R (
pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType)

: nppiFilterColumnBorder_32f_C3R (
pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset,
pDeviceDst, nDstStep, oSizeROI, pDeviceKernel, nMaskSize, nAnchor, eBorderType)

// Sobel with mask
// ? nppiFilterSobelHorizMaskBorder_32f_C1R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType)
// : nppiFilterSobelVertMaskBorder_32f_C1R (pDeviceSrc, nSrcStep, oSrcSize, oSrcOffset, pDeviceDst, nDstStep, oSizeROI, NPP_MASK_SIZE_1_X_3, eBorderType)
);

compute_time += calc_print_elapsed("sobel", start_sobel);


auto start_memcpy_dh = now();

// Copy result to host
dest.create(height, width, CV_32FC3);
float* pHostDst = (float*) dest.data;

std::memcpy(pHostDst, pDeviceDst, width * height * elemSize);

total_time += calc_print_elapsed("std::memcpy H<-D", start_memcpy_dh);

cudaFree((void*) pDeviceSrc);
cudaFree((void*) pDeviceDst);

// Only for custom row/col filter
cudaFree((void*) pDeviceKernel);

std::cout << "[done] sobel" << std::endl;
std::cout << " primary compute time: " << compute_time << " (ms)" << std::endl;
std::cout << " total compute time: " << compute_time + total_time << " (ms)" << std::endl;
}

}

3 changes: 3 additions & 0 deletions src/run_dense.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#include "kernels/warmup.h"
#include "kernels/pad.h"
#include "common/timer.h"
#include "common/cuda_helper.h"


using namespace std;
Expand Down Expand Up @@ -113,6 +114,8 @@ int AutoFirstScaleSelect(int imgwidth, int fratio, int patchsize) {

int main( int argc, char** argv ) {

initializeCuda(argc, argv);

// Warmup GPU
cu::warmup();

Expand Down

0 comments on commit e9c83cd

Please sign in to comment.