From de459930f95fa2f94147801740e89760f432bb09 Mon Sep 17 00:00:00 2001
From: Kaushik-Iyer
Date: Mon, 2 Dec 2024 17:12:04 -0800
Subject: [PATCH] greedy cuda implementation + updated comparisons utils file

---
 Makefile                  |   7 +-
 algorithms/greedy_cuda.cu | 195 ++++++++++++++++++++++++++++++++++++++
 utils/comparisons.py      |  68 ++++++++-----
 3 files changed, 245 insertions(+), 25 deletions(-)
 create mode 100644 algorithms/greedy_cuda.cu

diff --git a/Makefile b/Makefile
index d45c7d6..c9b68d3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 CPP=g++
-NVCC=nvcc
+NVCC=nvcc -arch=sm_70
+NVCCFLAGS=-DCUDA
 CFLAGS=-lm
 OPTFLAGS=-O3
@@ -19,6 +20,7 @@
 greedy: $(BUILD_DIR)/greedy
 genetic: $(BUILD_DIR)/genetic
 cu_genetic: $(BUILD_DIR)/cu_genetic
 dp_omp: $(BUILD_DIR)/dp_omp
+greedy_cuda: $(BUILD_DIR)/greedy_cuda
 
 $(BUILD_DIR)/brute: $(SRC_DIR)/main.cpp $(ALGO_DIR)/brute.cpp
 	$(CPP) $^ -o $@ $(CFLAGS) $(OPTFLAGS)
@@ -38,6 +40,9 @@ $(BUILD_DIR)/cu_genetic: $(SRC_DIR)/main.cpp $(ALGO_DIR)/genetic.cu
 $(BUILD_DIR)/dp_omp: $(SRC_DIR)/main.cpp $(ALGO_DIR)/dp_omp.cpp
 	$(CPP) $^ -o $@ $(CFLAGS_DP) $(OPTFLAGS_DP)
 
+$(BUILD_DIR)/greedy_cuda: $(SRC_DIR)/main.cpp $(ALGO_DIR)/greedy_cuda.cu
+	$(NVCC) $^ -o $@ $(NVCCFLAGS)
+
 .PHONY: clean
 clean:
diff --git a/algorithms/greedy_cuda.cu b/algorithms/greedy_cuda.cu
new file mode 100644
index 0000000..bbf3f00
--- /dev/null
+++ b/algorithms/greedy_cuda.cu
@@ -0,0 +1,195 @@
+#include <cstdio>
+#include <cmath>
+#include <vector>
+#include <cuda_runtime.h>
+#include "../common/algorithms.hpp"
+
+const int BLOCK_SIZE = 256;
+
+__global__ void findNearestCityKernel(
+    const double* coords_x,   // Array of x coordinates
+    const double* coords_y,   // Array of y coordinates
+    const char* visited,      // Array of visited flags (1 if visited, 0 if not)
+    int current_city,         // Index of the current city
+    int n,                    // Total number of cities
+    double* min_distances,    // Output: per-block minimum distance
+    int* next_cities          // Output: per-block nearest unvisited city
+) {
+    __shared__ double shared_min_distances[BLOCK_SIZE]; // Shared memory for each thread's minimum distance
+    __shared__ int shared_next_cities[BLOCK_SIZE];      // Shared memory for each thread's candidate city
+
+    int tid = threadIdx.x;
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Load the current city's coordinates into shared memory
+    __shared__ double current_coords[2];
+    if (tid == 0) { // Only the first thread in the block loads the current city's coordinates
+        current_coords[0] = coords_x[current_city];
+        current_coords[1] = coords_y[current_city];
+    }
+    __syncthreads(); // Ensure all threads wait until the coordinates are loaded
+
+    double local_min = INFINITY;
+    int local_next = -1;
+
+    // Grid-stride loop: each thread maintains its own minimum distance and corresponding city.
+    // blockDim.x * gridDim.x is the total number of threads in the grid.
+    for (int j = gid; j < n; j += blockDim.x * gridDim.x) {
+        if (!visited[j]) { // Only consider cities that have not been visited
+            double dx = current_coords[0] - coords_x[j];
+            double dy = current_coords[1] - coords_y[j];
+            double dist = sqrt(dx * dx + dy * dy);
+            if (dist < local_min) {
+                local_min = dist;
+                local_next = j;
+            }
+        }
+    }
+
+    // For example, if n = 1000 and the grid had only 256 threads in total, each thread
+    // would examine about 4 cities. With the launch configuration used below, the grid
+    // has at least n threads, so each thread examines at most one city.
+
+    shared_min_distances[tid] = local_min;
+    shared_next_cities[tid] = local_next;
+    __syncthreads(); // Ensure every thread has written its local minimum to shared memory
+
+    // Parallel reduction to find the block-wide minimum distance and corresponding city.
+    /* How the reduction works: shared_min_distances holds one candidate per thread
+       (256 of them for a 256-thread block). The stride starts at 128, then halves to
+       64, 32, 16, 8, 4, 2, 1. At each step, thread tid compares elements tid and
+       tid + stride (so 0 vs 128, 1 vs 129, 2 vs 130, and so on), keeping the smaller
+       distance and its city, until the block minimum ends up at index 0.
+    */
+    for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            if (shared_min_distances[tid + stride] < shared_min_distances[tid]) {
+                shared_min_distances[tid] = shared_min_distances[tid + stride];
+                shared_next_cities[tid] = shared_next_cities[tid + stride];
+            }
+        }
+        __syncthreads();
+    }
+
+    // After the reduction, the block's minimum distance and corresponding city are at index 0
+    if (tid == 0) {
+        min_distances[blockIdx.x] = shared_min_distances[0];
+        next_cities[blockIdx.x] = shared_next_cities[0];
+    }
+
+    /* Benefits of the parallel reduction:
+       1. Finds the block minimum in O(log BLOCK_SIZE) steps instead of O(BLOCK_SIZE)
+       2. Reduces the number of global memory accesses
+    */
+}
+
+TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
+    int n = coordinates.size();
+
+    // Calculate the number of blocks needed to cover all n cities
+    const int threadsPerBlock = BLOCK_SIZE;
+    const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Pinned (page-locked) memory is never swapped out to disk, which speeds up
+    // transfers between CPU and GPU. Put the x and y coordinates into pinned memory.
+    double *h_coords_x, *h_coords_y;
+    cudaMallocHost(&h_coords_x, n * sizeof(double));
+    cudaMallocHost(&h_coords_y, n * sizeof(double));
+
+    for (int i = 0; i < n; i++) {
+        h_coords_x[i] = coordinates[i].first;
+        h_coords_y[i] = coordinates[i].second;
+    }
+
+    /* Allocate GPU memory for the coordinates
+       and copy the data from CPU to GPU */
+    double *d_coords_x, *d_coords_y;
+    cudaMalloc(&d_coords_x, n * sizeof(double));
+    cudaMalloc(&d_coords_y, n * sizeof(double));
+
+    // Copy coordinates to the device
+    cudaMemcpy(d_coords_x, h_coords_x, n * sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_coords_y, h_coords_y, n * sizeof(double), cudaMemcpyHostToDevice);
+
+    // Initialize path variables
+    std::vector<int> path;
+    double total_cost = 0.0;
+    std::vector<char> visited(n, 0);
+
+    // Allocate device memory for the visited array and the per-block results
+    char* d_visited;
+    cudaMalloc(&d_visited, n * sizeof(char));
+
+    double* d_min_distances;
+    int* d_next_cities;
+    cudaMalloc(&d_min_distances, numBlocks * sizeof(double)); // One slot per block, not per thread
+    cudaMalloc(&d_next_cities, numBlocks * sizeof(int));
+
+    // Start from city 0
+    int current_city = 0;
+    path.push_back(current_city);
+    visited[current_city] = 1;
+    cudaMemcpy(d_visited, visited.data(), n * sizeof(char), cudaMemcpyHostToDevice);
+
+    // Main loop: repeatedly find the nearest unvisited city
+    while ((int)path.size() < n) {
+        // Launch the kernel with the calculated dimensions
+        findNearestCityKernel<<<numBlocks, threadsPerBlock>>>(
+            d_coords_x,
+            d_coords_y,
+            d_visited,
+            current_city,
+            n,
+            d_min_distances,
+            d_next_cities
+        );
+
+        // Check for kernel launch errors
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            printf("Kernel launch error: %s\n", cudaGetErrorString(err));
+        }
+
+        // Copy the per-block results back to the host
+        // (cudaMemcpy synchronizes with the kernel on the same stream before copying)
+        std::vector<double> h_min_distances(numBlocks);
+        std::vector<int> h_next_cities(numBlocks);
+        cudaMemcpy(h_min_distances.data(), d_min_distances, numBlocks * sizeof(double), cudaMemcpyDeviceToHost);
+        cudaMemcpy(h_next_cities.data(), d_next_cities, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
+
+        // Find the global minimum across all blocks
+        double min_distance = INFINITY;
+        int next_city = -1;
+        for (int i = 0; i < numBlocks; i++) {
+            if (h_min_distances[i] < min_distance) {
+                min_distance = h_min_distances[i];
+                next_city = h_next_cities[i];
+            }
+        }
+
+        // Update the path
+        current_city = next_city;
+        path.push_back(current_city);
+        visited[current_city] = 1;
+        cudaMemcpy(d_visited, visited.data(), n * sizeof(char), cudaMemcpyHostToDevice);
+        total_cost += min_distance;
+    }
+
+    // Add the cost of returning to the start
+    double dx = h_coords_x[path.back()] - h_coords_x[path[0]];
+    double dy = h_coords_y[path.back()] - h_coords_y[path[0]];
+    total_cost += sqrt(dx * dx + dy * dy);
+
+    // Clean up all host and device allocations
+    cudaFreeHost(h_coords_x);
+    cudaFreeHost(h_coords_y);
+    cudaFree(d_coords_x);
+    cudaFree(d_coords_y);
+    cudaFree(d_visited);
+    cudaFree(d_min_distances);
+    cudaFree(d_next_cities);
+
+    TSPResult result;
+    result.cost = total_cost;
+    result.path = path;
+    return result;
+}
\ No newline at end of file
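
The patch checks for a failed kernel launch but ignores the return values of the cudaMalloc/cudaMemcpy calls. A minimal sketch of a checking wrapper that could guard those calls; CUDA_CHECK is an illustrative name, not something this patch defines:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Hypothetical helper (not part of this patch): abort with a readable
    // message when a CUDA runtime call returns anything but cudaSuccess.
    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t err_ = (call);                                     \
            if (err_ != cudaSuccess) {                                     \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                        cudaGetErrorString(err_), __FILE__, __LINE__);     \
                exit(EXIT_FAILURE);                                        \
            }                                                              \
        } while (0)

    // Example usage inside solve():
    //   CUDA_CHECK(cudaMalloc(&d_coords_x, n * sizeof(double)));
    //   CUDA_CHECK(cudaMemcpy(d_coords_x, h_coords_x, n * sizeof(double),
    //                         cudaMemcpyHostToDevice));
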
diff --git a/utils/comparisons.py b/utils/comparisons.py
index 3032e20..b5a4973 100644
--- a/utils/comparisons.py
+++ b/utils/comparisons.py
@@ -3,50 +3,76 @@
 import time
 import argparse
 import matplotlib.pyplot as plt
+import numpy as np
 
 def run_implementation(exec_path: str, dataset: str) -> float:
-    """Run implementation with dataset and return execution time"""
     try:
         start = time.time()
         result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
-        end = time.time()
-        return end - start
+        return time.time() - start
     except Exception as e:
         print(f"Error running {exec_path} with {dataset}: {e}")
         return -1
 
 def plot_results(results):
-    """Plot timing results for all implementations"""
-    plt.figure(figsize=(12, 8))
+    """Plot timing results with multiple views"""
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
     colors = {
         'brute': 'b',
         'greedy': 'g',
         'genetic': 'r',
         'dp': 'purple',
-        'greedy_omp': 'y'
+        'greedy_cuda': 'y'
     }
     datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
     x_pos = range(len(datasets))
 
+    # Plot 1: Log scale (good for seeing small differences)
     for impl, timings in results.items():
-        if timings: # Only plot if we have data
-            plt.plot(x_pos, [timings[d] for d in datasets],
+        if timings:
+            ax1.plot(x_pos, [timings[d] for d in datasets],
                     f'{colors[impl]}-o',
-                    label=f'{impl.capitalize()} Implementation')
+                    label=f'{impl.capitalize()}')
+    ax1.set_yscale('log')
+    ax1.set_title('Log Scale Comparison')
+    ax1.set_xticks(x_pos)
+    ax1.set_xticklabels(datasets)
+    ax1.grid(True)
+    ax1.legend()
 
-    plt.xticks(x_pos, datasets)
-    plt.xlabel('Dataset Size')
-    plt.ylabel('Execution Time (seconds)')
-    plt.title('TSP Implementation Performance Comparison')
-    plt.legend()
-    plt.grid(True)
-    plt.yscale('log') # Log scale for better visibility
+    # Plot 2: Linear scale (good for seeing large differences)
+    for impl, timings in results.items():
+        if timings:
+            ax2.plot(x_pos, [timings[d] for d in datasets],
+                    f'{colors[impl]}-o',
+                    label=f'{impl.capitalize()}')
+    ax2.set_title('Linear Scale Comparison')
+    ax2.set_xticks(x_pos)
+    ax2.set_xticklabels(datasets)
+    ax2.grid(True)
+    ax2.legend()
+
+    # Plot 3: Speedup relative to the greedy baseline
+    if 'greedy' in results:
+        baseline = results['greedy']
+        for impl, timings in results.items():
+            if impl != 'greedy' and timings:
+                speedups = [baseline[d]/timings[d] for d in datasets]
+                ax3.plot(x_pos, speedups,
+                        f'{colors[impl]}-o',
+                        label=f'{impl.capitalize()}')
+    ax3.axhline(y=1.0, color='k', linestyle='--')
+    ax3.set_title('Speedup vs. Greedy')
+    ax3.set_xticks(x_pos)
+    ax3.set_xticklabels(datasets)
+    ax3.grid(True)
+    ax3.legend()
 
-    plt.savefig('comparison_results.png')
+    plt.tight_layout()
+    plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
     print("Plot saved as comparison_results.png")
 
     try:
@@ -55,7 +81,7 @@ def plot_results(results):
         print(f"Could not display plot: {e}")
 
 if __name__ == "__main__":
-    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_omp']
+    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']
 
     parser = argparse.ArgumentParser(description='Compare TSP implementations')
     parser.add_argument('implementations',
@@ -64,14 +90,10 @@ def plot_results(results):
                        help='Implementations to compare. If none specified, runs all.')
 
     args = parser.parse_args()
-
-    # Select implementations to run
     implementations = args.implementations if args.implementations else all_implementations
 
     datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
     results = {}
-
-    # Run tests for each implementation and dataset
     for impl in implementations:
        results[impl] = {}
        exec_path = f'./build/{impl}'
 
        for dataset in datasets:
            print(f"Running {impl} on {dataset} dataset...")
            execution_time = run_implementation(exec_path, dataset)
-
            if execution_time >= 0:
                results[impl][dataset] = execution_time
                print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")
 
-    # Plot results
     plot_results(results)
\ No newline at end of file
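
Since nearest-neighbour is deterministic for a fixed start city, the CUDA tour cost can be sanity-checked against a plain host-side pass. A minimal sketch under that assumption; greedy_cost_cpu is an illustrative helper, not part of the patch, and exact ties between equidistant cities may be broken differently by the block reduction, so paths can differ even when costs agree:

    #include <cmath>
    #include <utility>
    #include <vector>

    // Host-side reference: O(n^2) nearest-neighbour tour from city 0,
    // closing the tour back to the start like the CUDA version does.
    double greedy_cost_cpu(const std::vector<std::pair<double, double>>& c) {
        int n = (int)c.size();
        std::vector<char> visited(n, 0);
        visited[0] = 1;
        int cur = 0;
        double total = 0.0;
        for (int step = 1; step < n; step++) {
            double best = INFINITY;
            int next = -1;
            // Scan all unvisited cities for the nearest one
            for (int j = 0; j < n; j++) {
                if (visited[j]) continue;
                double dx = c[cur].first - c[j].first;
                double dy = c[cur].second - c[j].second;
                double d = std::sqrt(dx * dx + dy * dy);
                if (d < best) { best = d; next = j; }
            }
            visited[next] = 1;
            total += best;
            cur = next;
        }
        // Add the cost of returning to the start
        double dx = c[cur].first - c[0].first;
        double dy = c[cur].second - c[0].second;
        return total + std::sqrt(dx * dx + dy * dy);
    }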