From de459930f95fa2f94147801740e89760f432bb09 Mon Sep 17 00:00:00 2001
From: Kaushik-Iyer
Date: Mon, 2 Dec 2024 17:12:04 -0800
Subject: [PATCH] greedy cuda implementation + updated comparisons utils file

---
 Makefile                  |   7 +-
 algorithms/greedy_cuda.cu | 195 ++++++++++++++++++++++++++++++++++++++
 utils/comparisons.py      |  68 ++++++++-----
 3 files changed, 245 insertions(+), 25 deletions(-)
 create mode 100644 algorithms/greedy_cuda.cu

diff --git a/Makefile b/Makefile
index d45c7d6..c9b68d3 100644
--- a/Makefile
+++ b/Makefile
@@ -1,5 +1,6 @@
 CPP=g++
-NVCC=nvcc
+NVCC=nvcc -arch=sm_70
+NVCCFLAGS=-DCUDA
 CFLAGS=-lm
 OPTFLAGS=-O3
@@ -19,6 +20,7 @@
 greedy: $(BUILD_DIR)/greedy
 genetic: $(BUILD_DIR)/genetic
 cu_genetic: $(BUILD_DIR)/cu_genetic
 dp_omp: $(BUILD_DIR)/dp_omp
+greedy_cuda: $(BUILD_DIR)/greedy_cuda
 
 $(BUILD_DIR)/brute: $(SRC_DIR)/main.cpp $(ALGO_DIR)/brute.cpp
 	$(CPP) $^ -o $@ $(CFLAGS) $(OPTFLAGS)
@@ -38,6 +40,9 @@ $(BUILD_DIR)/cu_genetic: $(SRC_DIR)/main.cpp $(ALGO_DIR)/genetic.cu
 $(BUILD_DIR)/dp_omp: $(SRC_DIR)/main.cpp $(ALGO_DIR)/dp_omp.cpp
 	$(CPP) $^ -o $@ $(CFLAGS_DP) $(OPTFLAGS_DP)
 
+$(BUILD_DIR)/greedy_cuda: $(SRC_DIR)/main.cpp $(ALGO_DIR)/greedy_cuda.cu
+	$(NVCC) $^ -o $@ $(NVCCFLAGS)
+
 .PHONY: clean
 clean:
diff --git a/algorithms/greedy_cuda.cu b/algorithms/greedy_cuda.cu
new file mode 100644
index 0000000..bbf3f00
--- /dev/null
+++ b/algorithms/greedy_cuda.cu
@@ -0,0 +1,195 @@
+#include <cstdio>
+#include <cmath>
+#include <vector>
+#include <cuda_runtime.h>
+#include "../common/algorithms.hpp"
+
+const int BLOCK_SIZE = 256;
+
+__global__ void findNearestCityKernel(
+    const double* coords_x,   // Array of x coordinates
+    const double* coords_y,   // Array of y coordinates
+    const char* visited,      // Array of visited flags (1 if visited, 0 if not)
+    int current_city,         // Index of the current city
+    int n,                    // Total number of cities
+    double* min_distances,    // Output: per-block minimum distance
+    int* next_cities          // Output: per-block nearest unvisited city
+) {
+    __shared__ double shared_min_distances[BLOCK_SIZE]; // Shared memory for each thread's minimum distance
+    __shared__ int shared_next_cities[BLOCK_SIZE];      // Shared memory for each thread's candidate city
+
+    int tid = threadIdx.x;
+    int gid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    // Load the current city's coordinates into shared memory
+    __shared__ double current_coords[2];
+    if (tid == 0) { // Only the first thread in the block loads the current city's coordinates
+        current_coords[0] = coords_x[current_city];
+        current_coords[1] = coords_y[current_city];
+    }
+    __syncthreads(); // Ensure all threads wait until the coordinates are loaded
+
+    double local_min = INFINITY;
+    int local_next = -1;
+
+    // Grid-stride loop: each thread maintains its own minimum distance and corresponding city.
+    // blockDim.x * gridDim.x is the total number of threads in the grid.
+    for (int j = gid; j < n; j += blockDim.x * gridDim.x) {
+        if (!visited[j]) { // Only consider cities that have not been visited
+            double dx = current_coords[0] - coords_x[j];
+            double dy = current_coords[1] - coords_y[j];
+            double dist = sqrt(dx * dx + dy * dy);
+            if (dist < local_min) {
+                local_min = dist;
+                local_next = j;
+            }
+        }
+    }
+
+    // For example, if n = 1000 and the grid had only 256 threads in total, each thread
+    // would examine about 4 cities. With the launch configuration used below, the grid
+    // has at least n threads, so each thread examines at most one city.
+
+    shared_min_distances[tid] = local_min;
+    shared_next_cities[tid] = local_next;
+    __syncthreads(); // Ensure every thread has written its local minimum to shared memory
+
+    // Parallel reduction to find the block-wide minimum distance and corresponding city.
+    /* How the reduction works: shared_min_distances holds one candidate per thread
+       (256 of them for a 256-thread block). The stride starts at 128, then halves to
+       64, 32, 16, 8, 4, 2, 1. At each step, thread tid compares elements tid and
+       tid + stride (so 0 vs 128, 1 vs 129, 2 vs 130, and so on), keeping the smaller
+       distance and its city, until the block minimum ends up at index 0.
+    */
+    for (int stride = BLOCK_SIZE / 2; stride > 0; stride >>= 1) {
+        if (tid < stride) {
+            if (shared_min_distances[tid + stride] < shared_min_distances[tid]) {
+                shared_min_distances[tid] = shared_min_distances[tid + stride];
+                shared_next_cities[tid] = shared_next_cities[tid + stride];
+            }
+        }
+        __syncthreads();
+    }
+
+    // After the reduction, the block's minimum distance and corresponding city are at index 0
+    if (tid == 0) {
+        min_distances[blockIdx.x] = shared_min_distances[0];
+        next_cities[blockIdx.x] = shared_next_cities[0];
+    }
+
+    /* Benefits of the parallel reduction:
+       1. Finds the block minimum in O(log BLOCK_SIZE) steps instead of O(BLOCK_SIZE)
+       2. Reduces the number of global memory accesses
+    */
+}
+
+TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
+    int n = coordinates.size();
+
+    // Calculate the number of blocks needed to cover all n cities
+    const int threadsPerBlock = BLOCK_SIZE;
+    const int numBlocks = (n + threadsPerBlock - 1) / threadsPerBlock;
+
+    // Pinned (page-locked) memory is never swapped out to disk, which speeds up
+    // transfers between CPU and GPU. Put the x and y coordinates into pinned memory.
+    double *h_coords_x, *h_coords_y;
+    cudaMallocHost(&h_coords_x, n * sizeof(double));
+    cudaMallocHost(&h_coords_y, n * sizeof(double));
+
+    for (int i = 0; i < n; i++) {
+        h_coords_x[i] = coordinates[i].first;
+        h_coords_y[i] = coordinates[i].second;
+    }
+
+    /* Allocate GPU memory for the coordinates
+       and copy the data from CPU to GPU */
+    double *d_coords_x, *d_coords_y;
+    cudaMalloc(&d_coords_x, n * sizeof(double));
+    cudaMalloc(&d_coords_y, n * sizeof(double));
+
+    // Copy coordinates to the device
+    cudaMemcpy(d_coords_x, h_coords_x, n * sizeof(double), cudaMemcpyHostToDevice);
+    cudaMemcpy(d_coords_y, h_coords_y, n * sizeof(double), cudaMemcpyHostToDevice);
+
+    // Initialize path variables
+    std::vector<int> path;
+    double total_cost = 0.0;
+    std::vector<char> visited(n, 0);
+
+    // Allocate device memory for the visited array and the per-block results
+    char* d_visited;
+    cudaMalloc(&d_visited, n * sizeof(char));
+
+    double* d_min_distances;
+    int* d_next_cities;
+    cudaMalloc(&d_min_distances, numBlocks * sizeof(double)); // One slot per block, not per thread
+    cudaMalloc(&d_next_cities, numBlocks * sizeof(int));
+
+    // Start from city 0
+    int current_city = 0;
+    path.push_back(current_city);
+    visited[current_city] = 1;
+    cudaMemcpy(d_visited, visited.data(), n * sizeof(char), cudaMemcpyHostToDevice);
+
+    // Main loop: repeatedly find the nearest unvisited city
+    while ((int)path.size() < n) {
+        // Launch the kernel with the calculated dimensions
+        findNearestCityKernel<<<numBlocks, threadsPerBlock>>>(
+            d_coords_x,
+            d_coords_y,
+            d_visited,
+            current_city,
+            n,
+            d_min_distances,
+            d_next_cities
+        );
+
+        // Check for kernel launch errors
+        cudaError_t err = cudaGetLastError();
+        if (err != cudaSuccess) {
+            printf("Kernel launch error: %s\n", cudaGetErrorString(err));
+        }
+
+        // Copy the per-block results back to the host
+        // (cudaMemcpy synchronizes with the kernel on the same stream before copying)
+        std::vector<double> h_min_distances(numBlocks);
+        std::vector<int> h_next_cities(numBlocks);
+        cudaMemcpy(h_min_distances.data(), d_min_distances, numBlocks * sizeof(double), cudaMemcpyDeviceToHost);
+        cudaMemcpy(h_next_cities.data(), d_next_cities, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);
+
+        // Find the global minimum across all blocks
+        double min_distance = INFINITY;
+        int next_city = -1;
+        for (int i = 0; i < numBlocks; i++) {
+            if (h_min_distances[i] < min_distance) {
+                min_distance = h_min_distances[i];
+                next_city = h_next_cities[i];
+            }
+        }
+
+        // Update the path
+        current_city = next_city;
+        path.push_back(current_city);
+        visited[current_city] = 1;
+        cudaMemcpy(d_visited, visited.data(), n * sizeof(char), cudaMemcpyHostToDevice);
+        total_cost += min_distance;
+    }
+
+    // Add the cost of returning to the start
+    double dx = h_coords_x[path.back()] - h_coords_x[path[0]];
+    double dy = h_coords_y[path.back()] - h_coords_y[path[0]];
+    total_cost += sqrt(dx * dx + dy * dy);
+
+    // Clean up all host and device allocations
+    cudaFreeHost(h_coords_x);
+    cudaFreeHost(h_coords_y);
+    cudaFree(d_coords_x);
+    cudaFree(d_coords_y);
+    cudaFree(d_visited);
+    cudaFree(d_min_distances);
+    cudaFree(d_next_cities);
+
+    TSPResult result;
+    result.cost = total_cost;
+    result.path = path;
+    return result;
+}
\ No newline at end of file
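
The patch checks for a failed kernel launch but ignores the return values of the cudaMalloc/cudaMemcpy calls. A minimal sketch of a checking wrapper that could guard those calls; CUDA_CHECK is an illustrative name, not something this patch defines:

    #include <cstdio>
    #include <cstdlib>
    #include <cuda_runtime.h>

    // Hypothetical helper (not part of this patch): abort with a readable
    // message when a CUDA runtime call returns anything but cudaSuccess.
    #define CUDA_CHECK(call)                                               \
        do {                                                               \
            cudaError_t err_ = (call);                                     \
            if (err_ != cudaSuccess) {                                     \
                fprintf(stderr, "CUDA error: %s at %s:%d\n",               \
                        cudaGetErrorString(err_), __FILE__, __LINE__);     \
                exit(EXIT_FAILURE);                                        \
            }                                                              \
        } while (0)

    // Example usage inside solve():
    //   CUDA_CHECK(cudaMalloc(&d_coords_x, n * sizeof(double)));
    //   CUDA_CHECK(cudaMemcpy(d_coords_x, h_coords_x, n * sizeof(double),
    //                         cudaMemcpyHostToDevice));
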
diff --git a/utils/comparisons.py b/utils/comparisons.py
index 3032e20..b5a4973 100644
--- a/utils/comparisons.py
+++ b/utils/comparisons.py
@@ -3,50 +3,76 @@
 import time
 import argparse
 import matplotlib.pyplot as plt
+import numpy as np
 
 def run_implementation(exec_path: str, dataset: str) -> float:
-    """Run implementation with dataset and return execution time"""
     try:
         start = time.time()
         result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
                               stdout=subprocess.PIPE,
                               stderr=subprocess.PIPE)
-        end = time.time()
-        return end - start
+        return time.time() - start
     except Exception as e:
         print(f"Error running {exec_path} with {dataset}: {e}")
         return -1
 
 def plot_results(results):
-    """Plot timing results for all implementations"""
-    plt.figure(figsize=(12, 8))
+    """Plot timing results with multiple views"""
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
     colors = {
         'brute': 'b',
         'greedy': 'g',
         'genetic': 'r',
         'dp': 'purple',
-        'greedy_omp': 'y'
+        'greedy_cuda': 'y'
     }
     datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
     x_pos = range(len(datasets))
 
+    # Plot 1: Log scale (good for seeing small differences)
     for impl, timings in results.items():
-        if timings: # Only plot if we have data
-            plt.plot(x_pos, [timings[d] for d in datasets],
+        if timings:
+            ax1.plot(x_pos, [timings[d] for d in datasets],
                     f'{colors[impl]}-o',
-                    label=f'{impl.capitalize()} Implementation')
+                    label=f'{impl.capitalize()}')
+    ax1.set_yscale('log')
+    ax1.set_title('Log Scale Comparison')
+    ax1.set_xticks(x_pos)
+    ax1.set_xticklabels(datasets)
+    ax1.grid(True)
+    ax1.legend()
 
-    plt.xticks(x_pos, datasets)
-    plt.xlabel('Dataset Size')
-    plt.ylabel('Execution Time (seconds)')
-    plt.title('TSP Implementation Performance Comparison')
-    plt.legend()
-    plt.grid(True)
-    plt.yscale('log') # Log scale for better visibility
+    # Plot 2: Linear scale (good for seeing large differences)
+    for impl, timings in results.items():
+        if timings:
+            ax2.plot(x_pos, [timings[d] for d in datasets],
+                    f'{colors[impl]}-o',
+                    label=f'{impl.capitalize()}')
+    ax2.set_title('Linear Scale Comparison')
+    ax2.set_xticks(x_pos)
+    ax2.set_xticklabels(datasets)
+    ax2.grid(True)
+    ax2.legend()
+
+    # Plot 3: Speedup relative to the greedy baseline
+    if 'greedy' in results:
+        baseline = results['greedy']
+        for impl, timings in results.items():
+            if impl != 'greedy' and timings:
+                speedups = [baseline[d]/timings[d] for d in datasets]
+                ax3.plot(x_pos, speedups,
+                        f'{colors[impl]}-o',
+                        label=f'{impl.capitalize()}')
+    ax3.axhline(y=1.0, color='k', linestyle='--')
+    ax3.set_title('Speedup vs. Greedy')
+    ax3.set_xticks(x_pos)
+    ax3.set_xticklabels(datasets)
+    ax3.grid(True)
+    ax3.legend()
 
-    plt.savefig('comparison_results.png')
+    plt.tight_layout()
+    plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
     print("Plot saved as comparison_results.png")
 
     try:
@@ -55,7 +81,7 @@ def plot_results(results):
         print(f"Could not display plot: {e}")
 
 if __name__ == "__main__":
-    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_omp']
+    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']
 
     parser = argparse.ArgumentParser(description='Compare TSP implementations')
     parser.add_argument('implementations',
@@ -64,14 +90,10 @@ def plot_results(results):
                        help='Implementations to compare. If none specified, runs all.')
 
     args = parser.parse_args()
-
-    # Select implementations to run
     implementations = args.implementations if args.implementations else all_implementations
 
     datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
     results = {}
-
-    # Run tests for each implementation and dataset
     for impl in implementations:
        results[impl] = {}
        exec_path = f'./build/{impl}'
 
        for dataset in datasets:
            print(f"Running {impl} on {dataset} dataset...")
            execution_time = run_implementation(exec_path, dataset)
-
            if execution_time >= 0:
                results[impl][dataset] = execution_time
                print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")
 
-    # Plot results
     plot_results(results)
\ No newline at end of file
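
Since nearest-neighbour is deterministic for a fixed start city, the CUDA tour cost can be sanity-checked against a plain host-side pass. A minimal sketch under that assumption; greedy_cost_cpu is an illustrative helper, not part of the patch, and exact ties between equidistant cities may be broken differently by the block reduction, so paths can differ even when costs agree:

    #include <cmath>
    #include <utility>
    #include <vector>

    // Host-side reference: O(n^2) nearest-neighbour tour from city 0,
    // closing the tour back to the start like the CUDA version does.
    double greedy_cost_cpu(const std::vector<std::pair<double, double>>& c) {
        int n = (int)c.size();
        std::vector<char> visited(n, 0);
        visited[0] = 1;
        int cur = 0;
        double total = 0.0;
        for (int step = 1; step < n; step++) {
            double best = INFINITY;
            int next = -1;
            // Scan all unvisited cities for the nearest one
            for (int j = 0; j < n; j++) {
                if (visited[j]) continue;
                double dx = c[cur].first - c[j].first;
                double dy = c[cur].second - c[j].second;
                double d = std::sqrt(dx * dx + dy * dy);
                if (d < best) { best = d; next = j; }
            }
            visited[next] = 1;
            total += best;
            cur = next;
        }
        // Add the cost of returning to the start
        double dx = c[cur].first - c[0].first;
        double dy = c[cur].second - c[0].second;
        return total + std::sqrt(dx * dx + dy * dy);
    }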