This repository has been archived by the owner on Jun 27, 2022. It is now read-only.

updated directory paths in CMakeList #17

Closed
wants to merge 6 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@
build
bin

*gpusort-ipdps09.pdf
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -39,12 +39,12 @@ if(CUDA_FOUND)
# add debugging to CUDA NVCC flags. For NVidia's NSight tools.
set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G")

add_subdirectory (HW1)
add_subdirectory (HW2)
add_subdirectory (HW3)
add_subdirectory (HW4)
add_subdirectory (HW5)
add_subdirectory (HW6)
add_subdirectory (Problem\ Sets/Problem\ Set\ 1)
add_subdirectory (Problem\ Sets/Problem\ Set\ 2)
add_subdirectory (Problem\ Sets/Problem\ Set\ 3)
add_subdirectory (Problem\ Sets/Problem\ Set\ 4)
add_subdirectory (Problem\ Sets/Problem\ Set\ 5)
add_subdirectory (Problem\ Sets/Problem\ Set\ 6)
else(CUDA_FOUND)
message("CUDA is not installed on this system.")
endif()
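
Note on the new paths: the escaped spaces in add_subdirectory (Problem\ Sets/Problem\ Set\ 1) can equivalently be written as a quoted argument, add_subdirectory("Problem Sets/Problem Set 1"). Each listed directory must supply its own CMakeLists.txt; a minimal sketch of such a file is below, assuming the FindCUDA module already loaded above and using hypothetical target and source names (none of this is taken from the PR itself):

    # "Problem Sets/Problem Set 1/CMakeLists.txt" -- illustrative sketch only;
    # the target name, source list, and OpenCV linkage are assumptions.
    file(GLOB cuda_sources *.cu)
    cuda_add_executable(HW1 main.cpp ${cuda_sources})
    target_link_libraries(HW1 ${OpenCV_LIBS})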
2 changes: 1 addition & 1 deletion Problem Sets/Problem Set 1/Makefile
@@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui

CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
CUDA_INCLUDEPATH=/usr/local/cuda/include

######################################################
# On Macs the default install locations are below #
63 changes: 35 additions & 28 deletions Problem Sets/Problem Set 1/student_func.cu
@@ -2,11 +2,11 @@
// Color to Greyscale Conversion

//A common way to represent color images is known as RGBA - the color
//is specified by how much Red, Grean and Blue is in it.
//The 'A' stands for Alpha and is used for transparency, it will be
//is specified by how much Red, Green, and Blue is in it.
//The 'A' stands for Alpha and is used for transparency; it will be
//ignored in this homework.

//Each channel Red, Blue, Green and Alpha is represented by one byte.
//Each channel Red, Blue, Green, and Alpha is represented by one byte.
//Since we are using one byte for each color there are 256 different
//possible values for each color. This means we use 4 bytes per pixel.

@@ -15,7 +15,7 @@

//To convert an image from color to grayscale one simple method is to
//set the intensity to the average of the RGB channels. But we will
//use a more sophisticated method that takes into account how the eye
//use a more sophisticated method that takes into account how the eye
//perceives color and weights the channels unequally.

//The eye responds most strongly to green followed by red and then blue.
@@ -24,43 +24,50 @@

//I = .299f * R + .587f * G + .114f * B

//Notice the trailing f's on the numbers which indicate that they are
//Notice the trailing f's on the numbers which indicate that they are
//single precision floating point constants and not double precision
//constants.

//You should fill in the kernel as well as set the block and grid sizes
//so that the entire image is processed.

#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
unsigned char* const greyImage,
int numRows, int numCols)
{
//TODO
//Fill in the kernel to convert from color to greyscale
//the mapping from components of a uchar4 to RGBA is:
// .x -> R ; .y -> G ; .z -> B ; .w -> A
//
//The output (greyImage) at each pixel should be the result of
//applying the formula: output = .299f * R + .587f * G + .114f * B;
//Note: We will be ignoring the alpha channel for this conversion

//First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset
//TODO
//Fill in the kernel to convert from color to greyscale
//the mapping from components of a uchar4 to RGBA is:
// .x -> R ; .y -> G ; .z -> B ; .w -> A
//
//The output (greyImage) at each pixel should be the result of
//applying the formula: output = .299f * R + .587f * G + .114f * B;
//Note: We will be ignoring the alpha channel for this conversion

//First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset
// The launch below uses one block per pixel (numRows x numCols blocks),
// so blockIdx.x selects the row and blockIdx.y selects the column.
int x = blockIdx.x;
int y = blockIdx.y;
uchar4 colorValue = rgbaImage[x * numCols + y];
float greyValue = 0.299f * colorValue.x + 0.587f * colorValue.y + 0.114f * colorValue.z;
greyImage[x * numCols + y] = greyValue;
}

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(1, 1, 1); //TODO
const dim3 gridSize( 1, 1, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

// Launch configuration: one thread per block and one block per pixel;
// a 2D-block alternative is sketched after this function.
const dim3 threadsPerBlock( 1, 1, 1);
const dim3 numberOfBlocks( numRows, numCols, 1);
rgba_to_greyscale<<<numberOfBlocks, threadsPerBlock>>>(d_rgbaImage, d_greyImage, numRows, numCols);

cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
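
The launch above is correct but schedules only a single thread per block. A common alternative maps one thread to one pixel with 2D thread blocks and a bounds check; the sketch below keeps the same parameters, while the kernel name rgba_to_greyscale_2d and the 32x16 block shape are arbitrary choices, not anything prescribed by the assignment.

    // Illustrative sketch only: one thread per pixel with 2D blocks.
    __global__
    void rgba_to_greyscale_2d(const uchar4* const rgbaImage,
                              unsigned char* const greyImage,
                              int numRows, int numCols)
    {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;
      if (col >= numCols || row >= numRows)
        return;                                  // threads outside the image do nothing

      const uchar4 rgba = rgbaImage[row * numCols + col];
      greyImage[row * numCols + col] =
          (unsigned char)(0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z);
    }

    // Possible launch: ceil-divide the image size by the block shape.
    // const dim3 threadsPerBlock(32, 16, 1);
    // const dim3 numberOfBlocks((numCols + 31) / 32, (numRows + 15) / 16, 1);
    // rgba_to_greyscale_2d<<<numberOfBlocks, threadsPerBlock>>>(d_rgbaImage, d_greyImage, numRows, numCols);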

28 changes: 15 additions & 13 deletions Problem Sets/Problem Set 2/student_func.cu
@@ -117,11 +117,11 @@ void gaussian_blur(const unsigned char* const inputChannel,
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// }
const int absolute_image_position_x = blockIdx.x * blockDim.x + threadIdx.x;
const int absolute_image_position_y = blockIdx.y * blockDim.y + threadIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}

// NOTE: If a thread's absolute 2D position is within the image, but some of
// its neighbors are outside the image, then you will need to be extra careful. Instead
@@ -147,11 +147,11 @@ void separateChannels(const uchar4* const inputImageRGBA,
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// }
const int absolute_image_position_x = blockIdx.x * blockDim.x + threadIdx.x;
const int absolute_image_position_y = blockIdx.y * blockDim.y + threadIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}
}

//This kernel takes in three color channels and recombines them
@@ -205,6 +205,7 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
//be sure to use checkCudaErrors like the above examples to
//be able to tell if anything goes wrong
//IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth)); // d_filter: device pointer for the filter weights declared earlier in this file

//TODO:
//Copy the filter on the host (h_filter) to the memory you just allocated
@@ -221,15 +221,16 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
const int filterWidth)
{
//TODO: Set reasonable block size (i.e., number of threads per block)
const dim3 blockSize;
const dim3 blockSize( 1, 1, 1);

//TODO:
//Compute correct grid size (i.e., number of blocks per kernel launch)
//from the image size and and block size.
const dim3 gridSize;
const dim3 gridSize( 1, 1, 1);

//TODO: Launch a kernel for separating the RGBA image into different color channels

separateChannels<<<gridSize, blockSize>>>(...); // TODO: pass the RGBA input image, its dimensions, and the three per-channel output buffers

// Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
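
For the blockSize and gridSize TODOs above, a common choice is a fixed 2D block with a grid obtained by ceiling division of the image dimensions. The sketch below is illustrative only; the 16x16 shape and any name not visible in this diff are assumptions:

    // Illustrative sketch only -- not part of the submitted change.
    const dim3 blockSize(16, 16, 1);                                // 256 threads per block
    const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x,  // ceiling division so the grid
                        (numRows + blockSize.y - 1) / blockSize.y,  // covers every pixel
                        1);

Inside separateChannels, each in-bounds thread would then compute a flat index y * numCols + x, read that uchar4 from inputImageRGBA, and write its .x, .y, and .z components to the red, green, and blue output buffers.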
162 changes: 145 additions & 17 deletions Problem Sets/Problem Set 3/student_func.cu
@@ -81,24 +81,152 @@

#include "utils.h"

void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
__global__ void reduceLuminance(const float* const d_logLuminance, bool isMax, float *result, int inputSize)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
// // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[];

int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;

// load shared mem from global mem
if(myId < inputSize)
{
sdata[tid] = d_logLuminance[myId];
}
__syncthreads(); // make sure entire block is loaded!

// do reduction in shared mem
for (int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s && (myId+s) < inputSize)
{
float val1=sdata[tid];
float val2=sdata[tid + s];
sdata[tid] = isMax?(val1 > val2 ? val1 : val2) : (val1 < val2 ? val1 : val2);
}
__syncthreads(); // make sure all adds at one stage are done!
}

// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
result[blockIdx.x] = sdata[0];
}
}
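
// Worked example of the folding loop above (blockDim.x assumed to be 8 for brevity):
// s = 4 folds sdata[4..7] into sdata[0..3], s = 2 folds sdata[2..3] into
// sdata[0..1], and s = 1 folds sdata[1] into sdata[0], which then holds the
// block's min or max.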

__global__ void assignHistogram(int *d_bins, const float *d_in, int numBins, float lumRange, float lumMin, int inputSize)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;

if(myId<inputSize)
{
//bin = (lum[i] - lumMin) / lumRange * numBins
int myBin = (d_in[myId] - lumMin) / lumRange * numBins;
if (myBin >= numBins) // clamp so the maximum luminance lands in the last bin instead of one past the end
{
myBin = numBins - 1;
}
atomicAdd(&(d_bins[myBin]), 1);
}
}
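
// Worked example for the bin formula (values assumed, not from the assignment):
// with lumMin = -4.0f, lumRange = 8.0f and numBins = 1024, a luminance of 0.0f
// maps to bin (0.0 - (-4.0)) / 8.0 * 1024 = 512.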


__global__ void scanHistogram(const int* const d_bins, unsigned int *result, int inputSize)
{
// // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ int shareddata[];

int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;

// load shared mem from global mem, shifted down by one element so the
// Hillis-Steele scan below produces an exclusive prefix sum (cdf[0] = 0)
if(myId < inputSize)
{
shareddata[tid] = (myId > 0) ? d_bins[myId - 1] : 0;
}
__syncthreads(); // make sure entire block is loaded!

// Hillis-Steele scan; the read and the write of each step are separated by a
// barrier so no thread reads an element already updated in the same step
for (int step = 1; step < inputSize; step <<= 1)
{
int addend = 0;
if (tid >= step)
{
addend = shareddata[tid - step];
}
__syncthreads();

if (tid >= step)
{
shareddata[tid] += addend;
}
__syncthreads(); // make sure all adds at one stage are done!
}

// every thread writes result for this block back to global mem
if(myId < inputSize)
{
result[myId] = shareddata[tid];
}
}
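
// Worked example for the scan above (values assumed, not from the assignment):
// for a histogram d_bins = {3, 1, 4, 2}, an inclusive scan yields {3, 4, 8, 10},
// while the exclusive scan needed for the CDF yields {0, 3, 4, 8} -- each entry
// counts the values that fall in strictly earlier bins.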

void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */

// Two-pass reduction over the image treated as a flat 1D array
const int inputSize = numRows * numCols;
const int maxThreadsPerBlock = 1024;
unsigned int threads = maxThreadsPerBlock;
const int blocks = (inputSize + maxThreadsPerBlock - 1) / maxThreadsPerBlock; // ceiling division: every block holds at least one valid element

// The second pass runs in a single block, so round its thread count up to a
// power of two; the pairwise folding loop in reduceLuminance relies on it.
int reduceThreads = 1;
while (reduceThreads < blocks)
{
reduceThreads <<= 1;
}

const int sharedMemorySize_1 = threads * sizeof(float);
const int sharedMemorySize_2 = reduceThreads * sizeof(float);

float *d_intermediate, *d_result;
checkCudaErrors(cudaMalloc(&d_intermediate, sizeof(float) * blocks));
checkCudaErrors(cudaMalloc(&d_result, sizeof(float)));

// Step 1: min luminance
reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, false, d_intermediate, inputSize);
reduceLuminance<<<1, reduceThreads, sharedMemorySize_2>>>(d_intermediate, false, d_result, blocks);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(&min_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));

// Step 1: max luminance
reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, true, d_intermediate, inputSize);
reduceLuminance<<<1, reduceThreads, sharedMemorySize_2>>>(d_intermediate, true, d_result, blocks);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(&max_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));

// Step 2: luminance range
float range = max_logLum - min_logLum;

// Step 3: histogram
int *d_bins;
checkCudaErrors(cudaMalloc(&d_bins, sizeof(int) * numBins));
checkCudaErrors(cudaMemset(d_bins, 0, sizeof(int) * numBins));
assignHistogram<<<(inputSize/threads) +1, threads>>>(d_bins, d_logLuminance, numBins, range, min_logLum, inputSize);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

// Step 4: Exclusive Scan on d_bins
threads = 2;
while(threads < numBins)
{
threads <<= 1;
}

scanHistogram<<<1, threads, sizeof(unsigned int) * threads>>>(d_bins, d_cdf, numBins); // shared memory sized to the full (power-of-two) thread count
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

//free resources

cudaFree(d_intermediate);
cudaFree(d_result);
cudaFree(d_bins);
}
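
As a cross-check for the kernels above, the same four steps can be written serially on the host. The sketch below is illustrative only: it assumes a non-empty host-side copy of the luminance data (h_logLuminance) with max_logLum > min_logLum, and none of its names come from the assignment code.

    // Host-side reference for the same four steps (illustrative sketch only).
    #include <algorithm>
    #include <vector>

    void reference_cdf(const std::vector<float>& h_logLuminance,
                       std::vector<unsigned int>& h_cdf,
                       float& min_logLum, float& max_logLum,
                       size_t numBins)
    {
      // Step 1: min and max luminance
      min_logLum = *std::min_element(h_logLuminance.begin(), h_logLuminance.end());
      max_logLum = *std::max_element(h_logLuminance.begin(), h_logLuminance.end());

      // Step 2: range
      const float range = max_logLum - min_logLum;

      // Step 3: histogram, clamping the largest value into the last bin
      std::vector<unsigned int> bins(numBins, 0);
      for (float lum : h_logLuminance)
      {
        size_t bin = static_cast<size_t>((lum - min_logLum) / range * numBins);
        bins[std::min(bin, numBins - 1)]++;
      }

      // Step 4: exclusive scan of the histogram gives the CDF
      h_cdf.assign(numBins, 0);
      for (size_t i = 1; i < numBins; ++i)
      {
        h_cdf[i] = h_cdf[i - 1] + bins[i - 1];
      }
    }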