This repository has been archived by the owner on Jun 27, 2022. It is now read-only.

updated directory paths in CMakeList #17

Closed
wants to merge 6 commits
1 change: 1 addition & 0 deletions .gitignore
@@ -22,3 +22,4 @@
build
bin

*gpusort-ipdps09.pdf
12 changes: 6 additions & 6 deletions CMakeLists.txt
@@ -39,12 +39,12 @@ if(CUDA_FOUND)
# add debugging to CUDA NVCC flags. For NVidia's NSight tools.
set(CUDA_NVCC_FLAGS_DEBUG ${CUDA_NVCC_FLAGS_DEBUG} "-G")

add_subdirectory (HW1)
add_subdirectory (HW2)
add_subdirectory (HW3)
add_subdirectory (HW4)
add_subdirectory (HW5)
add_subdirectory (HW6)
add_subdirectory (Problem\ Sets/Problem\ Set\ 1)
add_subdirectory (Problem\ Sets/Problem\ Set\ 2)
add_subdirectory (Problem\ Sets/Problem\ Set\ 3)
add_subdirectory (Problem\ Sets/Problem\ Set\ 4)
add_subdirectory (Problem\ Sets/Problem\ Set\ 5)
add_subdirectory (Problem\ Sets/Problem\ Set\ 6)
else(CUDA_FOUND)
message("CUDA is not installed on this system.")
endif()
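
Note on the new paths: the escaped spaces in add_subdirectory (Problem\ Sets/Problem\ Set\ 1) can equivalently be written as a quoted argument, add_subdirectory("Problem Sets/Problem Set 1"). Each listed directory must supply its own CMakeLists.txt; a minimal sketch of such a file is below, assuming the FindCUDA module already loaded above and using hypothetical target and source names (none of this is taken from the PR itself):

    # "Problem Sets/Problem Set 1/CMakeLists.txt" -- illustrative sketch only;
    # the target name, source list, and OpenCV linkage are assumptions.
    file(GLOB cuda_sources *.cu)
    cuda_add_executable(HW1 main.cpp ${cuda_sources})
    target_link_libraries(HW1 ${OpenCV_LIBS})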
2 changes: 1 addition & 1 deletion Problem Sets/Problem Set 1/Makefile
@@ -22,7 +22,7 @@ OPENCV_INCLUDEPATH=/usr/include

OPENCV_LIBS=-lopencv_core -lopencv_imgproc -lopencv_highgui

CUDA_INCLUDEPATH=/usr/local/cuda-5.0/include
CUDA_INCLUDEPATH=/usr/local/cuda/include

######################################################
# On Macs the default install locations are below #
63 changes: 35 additions & 28 deletions Problem Sets/Problem Set 1/student_func.cu
@@ -2,11 +2,11 @@
// Color to Greyscale Conversion

//A common way to represent color images is known as RGBA - the color
//is specified by how much Red, Grean and Blue is in it.
//The 'A' stands for Alpha and is used for transparency, it will be
//is specified by how much Red, Green, and Blue is in it.
//The 'A' stands for Alpha and is used for transparency; it will be
//ignored in this homework.

//Each channel Red, Blue, Green and Alpha is represented by one byte.
//Each channel Red, Blue, Green, and Alpha is represented by one byte.
//Since we are using one byte for each color there are 256 different
//possible values for each color. This means we use 4 bytes per pixel.

@@ -15,7 +15,7 @@

//To convert an image from color to grayscale one simple method is to
//set the intensity to the average of the RGB channels. But we will
//use a more sophisticated method that takes into account how the eye
//use a more sophisticated method that takes into account how the eye
//perceives color and weights the channels unequally.

//The eye responds most strongly to green followed by red and then blue.
@@ -24,43 +24,50 @@

//I = .299f * R + .587f * G + .114f * B

//Notice the trailing f's on the numbers which indicate that they are
//Notice the trailing f's on the numbers which indicate that they are
//single precision floating point constants and not double precision
//constants.

//You should fill in the kernel as well as set the block and grid sizes
//so that the entire image is processed.

#include "reference_calc.cpp"
#include "utils.h"
#include <stdio.h>

__global__
void rgba_to_greyscale(const uchar4* const rgbaImage,
unsigned char* const greyImage,
int numRows, int numCols)
unsigned char* const greyImage,
int numRows, int numCols)
{
//TODO
//Fill in the kernel to convert from color to greyscale
//the mapping from components of a uchar4 to RGBA is:
// .x -> R ; .y -> G ; .z -> B ; .w -> A
//
//The output (greyImage) at each pixel should be the result of
//applying the formula: output = .299f * R + .587f * G + .114f * B;
//Note: We will be ignoring the alpha channel for this conversion

//First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset
//TODO
//Fill in the kernel to convert from color to greyscale
//the mapping from components of a uchar4 to RGBA is:
// .x -> R ; .y -> G ; .z -> B ; .w -> A
//
//The output (greyImage) at each pixel should be the result of
//applying the formula: output = .299f * R + .587f * G + .114f * B;
//Note: We will be ignoring the alpha channel for this conversion

//First create a mapping from the 2D block and grid locations
//to an absolute 2D location in the image, then use that to
//calculate a 1D offset
// The launch below uses one block per pixel (numRows x numCols blocks),
// so blockIdx.x selects the row and blockIdx.y selects the column.
int x = blockIdx.x;
int y = blockIdx.y;
uchar4 colorValue = rgbaImage[x * numCols + y];
float greyValue = 0.299f * colorValue.x + 0.587f * colorValue.y + 0.114f * colorValue.z;
greyImage[x * numCols + y] = greyValue;
}

void your_rgba_to_greyscale(const uchar4 * const h_rgbaImage, uchar4 * const d_rgbaImage,
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
unsigned char* const d_greyImage, size_t numRows, size_t numCols)
{
//You must fill in the correct sizes for the blockSize and gridSize
//currently only one block with one thread is being launched
const dim3 blockSize(1, 1, 1); //TODO
const dim3 gridSize( 1, 1, 1); //TODO
rgba_to_greyscale<<<gridSize, blockSize>>>(d_rgbaImage, d_greyImage, numRows, numCols);

cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

// Launch configuration: one thread per block and one block per pixel;
// a 2D-block alternative is sketched after this function.
const dim3 threadsPerBlock( 1, 1, 1);
const dim3 numberOfBlocks( numRows, numCols, 1);
rgba_to_greyscale<<<numberOfBlocks, threadsPerBlock>>>(d_rgbaImage, d_greyImage, numRows, numCols);

cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
}
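
The launch above is correct but schedules only a single thread per block. A common alternative maps one thread to one pixel with 2D thread blocks and a bounds check; the sketch below keeps the same parameters, while the kernel name rgba_to_greyscale_2d and the 32x16 block shape are arbitrary choices, not anything prescribed by the assignment.

    // Illustrative sketch only: one thread per pixel with 2D blocks.
    __global__
    void rgba_to_greyscale_2d(const uchar4* const rgbaImage,
                              unsigned char* const greyImage,
                              int numRows, int numCols)
    {
      const int col = blockIdx.x * blockDim.x + threadIdx.x;
      const int row = blockIdx.y * blockDim.y + threadIdx.y;
      if (col >= numCols || row >= numRows)
        return;                                  // threads outside the image do nothing

      const uchar4 rgba = rgbaImage[row * numCols + col];
      greyImage[row * numCols + col] =
          (unsigned char)(0.299f * rgba.x + 0.587f * rgba.y + 0.114f * rgba.z);
    }

    // Possible launch: ceil-divide the image size by the block shape.
    // const dim3 threadsPerBlock(32, 16, 1);
    // const dim3 numberOfBlocks((numCols + 31) / 32, (numRows + 15) / 16, 1);
    // rgba_to_greyscale_2d<<<numberOfBlocks, threadsPerBlock>>>(d_rgbaImage, d_greyImage, numRows, numCols);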

28 changes: 15 additions & 13 deletions Problem Sets/Problem Set 2/student_func.cu
@@ -117,11 +117,11 @@ void gaussian_blur(const unsigned char* const inputChannel,
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// }
const int absolute_image_position_x = blockIdx.x * blockDim.x + threadIdx.x;
const int absolute_image_position_y = blockIdx.y * blockDim.y + threadIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}

// NOTE: If a thread's absolute 2D position is within the image, but some of
// its neighbors are outside the image, then you will need to be extra careful. Instead
@@ -147,11 +147,11 @@ void separateChannels(const uchar4* const inputImageRGBA,
// the image. You'll want code that performs the following check before accessing
// GPU memory:
//
// if ( absolute_image_position_x >= numCols ||
// absolute_image_position_y >= numRows )
// {
// return;
// }
const int absolute_image_position_x = blockIdx.x * blockDim.x + threadIdx.x;
const int absolute_image_position_y = blockIdx.y * blockDim.y + threadIdx.y;
if ( absolute_image_position_x >= numCols ||
absolute_image_position_y >= numRows )
{
return;
}
}

//This kernel takes in three color channels and recombines them
@@ -205,6 +205,7 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
//be sure to use checkCudaErrors like the above examples to
//be able to tell if anything goes wrong
//IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth)); // d_filter: device pointer for the filter weights declared earlier in this file

//TODO:
//Copy the filter on the host (h_filter) to the memory you just allocated
@@ -221,15 +221,16 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
const int filterWidth)
{
//TODO: Set reasonable block size (i.e., number of threads per block)
const dim3 blockSize;
const dim3 blockSize( 1, 1, 1);

//TODO:
//Compute correct grid size (i.e., number of blocks per kernel launch)
//from the image size and and block size.
const dim3 gridSize;
const dim3 gridSize( 1, 1, 1);

//TODO: Launch a kernel for separating the RGBA image into different color channels

separateChannels<<<gridSize, blockSize>>>(...); // TODO: pass the RGBA input image, its dimensions, and the three per-channel output buffers

// Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
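
For the blockSize and gridSize TODOs above, a common choice is a fixed 2D block with a grid obtained by ceiling division of the image dimensions. The sketch below is illustrative only; the 16x16 shape and any name not visible in this diff are assumptions:

    // Illustrative sketch only -- not part of the submitted change.
    const dim3 blockSize(16, 16, 1);                                // 256 threads per block
    const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x,  // ceiling division so the grid
                        (numRows + blockSize.y - 1) / blockSize.y,  // covers every pixel
                        1);

Inside separateChannels, each in-bounds thread would then compute a flat index y * numCols + x, read that uchar4 from inputImageRGBA, and write its .x, .y, and .z components to the red, green, and blue output buffers.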
162 changes: 145 additions & 17 deletions Problem Sets/Problem Set 3/student_func.cu
@@ -81,24 +81,152 @@

#include "utils.h"

void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
__global__ void reduceLuminance(const float* const d_logLuminance, bool isMax, float *result, int inputSize)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */
// // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ float sdata[];

int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;

// load shared mem from global mem
if(myId < inputSize)
{
sdata[tid] = d_logLuminance[myId];
}
__syncthreads(); // make sure entire block is loaded!

// do reduction in shared mem
for (int s = blockDim.x / 2; s > 0; s >>= 1)
{
if (tid < s && (myId+s) < inputSize)
{
float val1=sdata[tid];
float val2=sdata[tid + s];
sdata[tid] = isMax?(val1 > val2 ? val1 : val2) : (val1 < val2 ? val1 : val2);
}
__syncthreads(); // make sure all adds at one stage are done!
}

// only thread 0 writes result for this block back to global mem
if (tid == 0)
{
result[blockIdx.x] = sdata[0];
}
}
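
// Worked example of the folding loop above (blockDim.x assumed to be 8 for brevity):
// s = 4 folds sdata[4..7] into sdata[0..3], s = 2 folds sdata[2..3] into
// sdata[0..1], and s = 1 folds sdata[1] into sdata[0], which then holds the
// block's min or max.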

__global__ void assignHistogram(int *d_bins, const float *d_in, int numBins, float lumRange, float lumMin, int inputSize)
{
int myId = threadIdx.x + blockDim.x * blockIdx.x;

if(myId<inputSize)
{
//bin = (lum[i] - lumMin) / lumRange * numBins
int myBin = (d_in[myId] - lumMin) / lumRange * numBins;
if (myBin >= numBins) // clamp so the maximum luminance lands in the last bin instead of one past the end
{
myBin = numBins - 1;
}
atomicAdd(&(d_bins[myBin]), 1);
}
}
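
// Worked example for the bin formula (values assumed, not from the assignment):
// with lumMin = -4.0f, lumRange = 8.0f and numBins = 1024, a luminance of 0.0f
// maps to bin (0.0 - (-4.0)) / 8.0 * 1024 = 512.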


__global__ void scanHistogram(const int* const d_bins, unsigned int *result, int inputSize)
{
// // sdata is allocated in the kernel call: 3rd arg to <<<b, t, shmem>>>
extern __shared__ int shareddata[];

int myId = threadIdx.x + blockDim.x * blockIdx.x;
int tid = threadIdx.x;

// load shared mem from global mem, shifted down by one element so the
// Hillis-Steele scan below produces an exclusive prefix sum (cdf[0] = 0)
if(myId < inputSize)
{
shareddata[tid] = (myId > 0) ? d_bins[myId - 1] : 0;
}
__syncthreads(); // make sure entire block is loaded!

// Hillis-Steele scan; the read and the write of each step are separated by a
// barrier so no thread reads an element already updated in the same step
for (int step = 1; step < inputSize; step <<= 1)
{
int addend = 0;
if (tid >= step)
{
addend = shareddata[tid - step];
}
__syncthreads();

if (tid >= step)
{
shareddata[tid] += addend;
}
__syncthreads(); // make sure all adds at one stage are done!
}

// every thread writes result for this block back to global mem
if(myId < inputSize)
{
result[myId] = shareddata[tid];
}
}
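
// Worked example for the scan above (values assumed, not from the assignment):
// for a histogram d_bins = {3, 1, 4, 2}, an inclusive scan yields {3, 4, 8, 10},
// while the exclusive scan needed for the CDF yields {0, 3, 4, 8} -- each entry
// counts the values that fall in strictly earlier bins.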

void your_histogram_and_prefixsum(const float* const d_logLuminance,
unsigned int* const d_cdf,
float &min_logLum,
float &max_logLum,
const size_t numRows,
const size_t numCols,
const size_t numBins)
{
//TODO
/*Here are the steps you need to implement
1) find the minimum and maximum value in the input logLuminance channel
store in min_logLum and max_logLum
2) subtract them to find the range
3) generate a histogram of all the values in the logLuminance channel using
the formula: bin = (lum[i] - lumMin) / lumRange * numBins
4) Perform an exclusive scan (prefix sum) on the histogram to get
the cumulative distribution of luminance values (this should go in the
incoming d_cdf pointer which already has been allocated for you) */

// Two-pass reduction over the image treated as a flat 1D array
const int inputSize = numRows * numCols;
const int maxThreadsPerBlock = 1024;
unsigned int threads = maxThreadsPerBlock;
const int blocks = (inputSize + maxThreadsPerBlock - 1) / maxThreadsPerBlock; // ceiling division: every block holds at least one valid element

// The second pass runs in a single block, so round its thread count up to a
// power of two; the pairwise folding loop in reduceLuminance relies on it.
int reduceThreads = 1;
while (reduceThreads < blocks)
{
reduceThreads <<= 1;
}

const int sharedMemorySize_1 = threads * sizeof(float);
const int sharedMemorySize_2 = reduceThreads * sizeof(float);

float *d_intermediate, *d_result;
checkCudaErrors(cudaMalloc(&d_intermediate, sizeof(float) * blocks));
checkCudaErrors(cudaMalloc(&d_result, sizeof(float)));

// Step 1: min luminance
reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, false, d_intermediate, inputSize);
reduceLuminance<<<1, reduceThreads, sharedMemorySize_2>>>(d_intermediate, false, d_result, blocks);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(&min_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));

// Step 1: max luminance
reduceLuminance<<<blocks, threads, sharedMemorySize_1>>>(d_logLuminance, true, d_intermediate, inputSize);
reduceLuminance<<<1, reduceThreads, sharedMemorySize_2>>>(d_intermediate, true, d_result, blocks);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
checkCudaErrors(cudaMemcpy(&max_logLum, d_result, sizeof(float), cudaMemcpyDeviceToHost));

// Step 2: luminance range
float range = max_logLum - min_logLum;

// Step 3: histogram
int *d_bins;
checkCudaErrors(cudaMalloc(&d_bins, sizeof(int) * numBins));
checkCudaErrors(cudaMemset(d_bins, 0, sizeof(int) * numBins));
assignHistogram<<<(inputSize/threads) +1, threads>>>(d_bins, d_logLuminance, numBins, range, min_logLum, inputSize);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

// Step 4: Exclusive Scan on d_bins
threads = 2;
while(threads < numBins)
{
threads <<= 1;
}

scanHistogram<<<1, threads, sizeof(unsigned int) * threads>>>(d_bins, d_cdf, numBins); // shared memory sized to the full (power-of-two) thread count
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

//free resources

cudaFree(d_intermediate);
cudaFree(d_result);
cudaFree(d_bins);
}
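
As a cross-check for the kernels above, the same four steps can be written serially on the host. The sketch below is illustrative only: it assumes a non-empty host-side copy of the luminance data (h_logLuminance) with max_logLum > min_logLum, and none of its names come from the assignment code.

    // Host-side reference for the same four steps (illustrative sketch only).
    #include <algorithm>
    #include <vector>

    void reference_cdf(const std::vector<float>& h_logLuminance,
                       std::vector<unsigned int>& h_cdf,
                       float& min_logLum, float& max_logLum,
                       size_t numBins)
    {
      // Step 1: min and max luminance
      min_logLum = *std::min_element(h_logLuminance.begin(), h_logLuminance.end());
      max_logLum = *std::max_element(h_logLuminance.begin(), h_logLuminance.end());

      // Step 2: range
      const float range = max_logLum - min_logLum;

      // Step 3: histogram, clamping the largest value into the last bin
      std::vector<unsigned int> bins(numBins, 0);
      for (float lum : h_logLuminance)
      {
        size_t bin = static_cast<size_t>((lum - min_logLum) / range * numBins);
        bins[std::min(bin, numBins - 1)]++;
      }

      // Step 4: exclusive scan of the histogram gives the CDF
      h_cdf.assign(numBins, 0);
      for (size_t i = 1; i < numBins; ++i)
      {
        h_cdf[i] = h_cdf[i - 1] + bins[i - 1];
      }
    }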