pushing dp-cuda and comparison util function

asmith236 · Dec 7, 2024 · fbda4e1 · fbda4e1
1 parent 7b9e9fe
commit fbda4e1
Showing 2 changed files with 260 additions and 0 deletions.
diff --git a/algorithms/dp-cuda.cu b/algorithms/dp-cuda.cu
@@ -0,0 +1,139 @@
+#include <bits/stdc++.h>
+#include <iostream>
+#include <vector>
+#include <cmath>
+#include <algorithm> // For std::next_permutation
+#include <limits>    // For std::numeric_limits
+#include "../common/algorithms.hpp" // Include the common constants file
+
+using namespace std;
+
+#define MAX INT_MAX// Large value as infinity
+
+// Function to compute distance
+__host__ __device__
+double compute_distance(const std::pair<double, double>& p1, const std::pair<double, double>& p2) {
+    return std::sqrt((p1.first - p2.first) * (p1.first - p2.first) + 
+                     (p1.second - p2.second) * (p1.second - p2.second));
+}
+
+__global__
+void tsp_kernel(double* dp, int* parent, const double* distances, int n, int subset_size) {
+    __shared__ double shared_distances[32][32]; // Shared memory for distances
+    int mask = blockIdx.x;                     // Each block handles one subset
+    int u = threadIdx.x;                       // Each thread handles one node
+
+    if (mask >= (1 << n) || __popc(mask) != subset_size || u >= n) return;
+
+    // Load distances into shared memory
+    if (u < n) {
+        for (int v = 0; v < n; v++) {
+            shared_distances[u][v] = distances[u * n + v];
+        }
+    }
+    __syncthreads();
+
+    if (!(mask & (1 << u))) return;
+
+    int prev_mask = mask ^ (1 << u);
+    double min_cost = MAX;
+    int best_parent = -1;
+
+    for (int v = 0; v < n; v++) {
+        if (!(prev_mask & (1 << v))) continue;
+        double new_cost = dp[prev_mask * n + v] + shared_distances[v][u];
+        if (new_cost < min_cost) {
+            min_cost = new_cost;
+            best_parent = v;
+        }
+    }
+
+    dp[mask * n + u] = min_cost;
+    parent[mask * n + u] = best_parent;
+}
+
+
+
+// Host function to solve TSP
+TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
+    int n = coordinates.size();
+    int full_mask = (1 << n) - 1;
+
+    // Precompute distances
+    std::vector<double> distances(n * n, 0.0);
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            distances[i * n + j] = compute_distance(coordinates[i], coordinates[j]);
+        }
+    }
+
+    // Allocate DP table and parent table
+    size_t dp_size = (1 << n) * n * sizeof(double);
+    size_t parent_size = (1 << n) * n * sizeof(int);
+    std::vector<double> dp_host((1 << n) * n, MAX);
+    std::vector<int> parent_host((1 << n) * n, -1);
+    dp_host[(1 << 0) * n + 0] = 0; // Equivalent to dp[1][0] = 0 in Python
+
+    double* dp_device;
+    int* parent_device;
+    double* distances_device;
+    cudaMalloc(&dp_device, dp_size);
+    cudaMalloc(&parent_device, parent_size);
+    cudaMalloc(&distances_device, n * n * sizeof(double));
+
+    cudaMemcpy(dp_device, dp_host.data(), dp_size, cudaMemcpyHostToDevice);
+    cudaMemcpy(parent_device, parent_host.data(), parent_size, cudaMemcpyHostToDevice);
+    cudaMemcpy(distances_device, distances.data(), n * n * sizeof(double), cudaMemcpyHostToDevice);
+
+    // Launch kernel for each subset size
+    for (int subset_size = 2; subset_size <= n; subset_size++) {
+        int num_blocks = (1 << n);  // One block for each subset
+        int num_threads = n;       // One thread for each node
+        tsp_kernel<<<num_blocks, num_threads>>>(dp_device, parent_device, distances_device, n, subset_size);
+        cudaDeviceSynchronize();
+    }
+
+    // Copy DP table and parent table back to host
+    cudaMemcpy(dp_host.data(), dp_device, dp_size, cudaMemcpyDeviceToHost);
+    cudaMemcpy(parent_host.data(), parent_device, parent_size, cudaMemcpyDeviceToHost);
+
+    // printf("DP:\n");
+    // for (int mask = 0; mask < (1 << n); mask++) {  // Iterate through all subsets (2^n subsets)
+    //     for (int u = 0; u < n; u++) {  // Iterate through each node in the subset
+    //         printf("%f ", dp_host[mask * n + u]);
+    //     }
+    //     printf("\n");
+    // }
+
+    // Find optimal cost and last node
+    double opt = MAX;
+    int last_node = -1;
+    for (int k = 1; k < n; k++) {
+        double new_cost = dp_host[full_mask * n + k] + distances[k * n];
+        if (new_cost < opt) {
+            opt = new_cost;
+            last_node = k;
+        }
+    }
+
+    // Reconstruct the path
+    std::vector<int> path;
+    int current_mask = full_mask;
+    int current_node = last_node;
+    while (current_node != -1) {
+        path.push_back(current_node);
+        int temp = current_node;
+        current_node = parent_host[current_mask * n + current_node];
+        current_mask ^= (1 << temp);
+    }
+    // path.push_back(0); // Add the starting node
+    std::reverse(path.begin(), path.end());
+
+    // Free GPU memory
+    cudaFree(dp_device);
+    cudaFree(parent_device);
+    cudaFree(distances_device);
+
+    // Return the result
+    return TSPResult{opt, path};
+}
diff --git a/utils/comparisons_all.py b/utils/comparisons_all.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+import subprocess
+import time
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+
+def run_implementation(exec_path: str, dataset: str) -> float:
+    try:
+        start = time.time()
+        result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'], 
+                              stdout=subprocess.PIPE,
+                              stderr=subprocess.PIPE)
+        return time.time() - start
+    except Exception as e:
+        print(f"Error running {exec_path} with {dataset}: {e}")
+        return -1
+
+def plot_results(results):
+    """Plot timing results with multiple views"""
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
+
+    colors = {
+        'brute': 'b',
+        'greedy': 'g',
+        'genetic': 'r',
+        'dp': 'c',
+        'greedy_cuda': 'y'
+    }
+
+    datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
+    x_pos = range(len(datasets))
+
+    # Plot 1: Log scale (good for small differences)
+    for impl, timings in results.items():
+        if timings:
+            filtered_datasets = [d for d in datasets if d in timings]  # Filter valid datasets for this implementation
+            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+            ax1.plot(x_pos_filtered, [timings[d] for d in filtered_datasets], 
+                    f'{colors[impl]}-o', 
+                    label=f'{impl.capitalize()}')
+    ax1.set_yscale('log')
+    ax1.set_title('Log Scale Comparison')
+    ax1.set_xticks(x_pos)
+    ax1.set_xticklabels(datasets)
+    ax1.grid(True)
+    ax1.legend()
+
+    # Plot 2: Linear scale (good for large differences)
+    for impl, timings in results.items():
+        if timings:
+            filtered_datasets = [d for d in datasets if d in timings]  # Filter valid datasets for this implementation
+            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+            ax2.plot(x_pos_filtered, [timings[d] for d in filtered_datasets], 
+                    f'{colors[impl]}-o', 
+                    label=f'{impl.capitalize()}')
+    ax2.set_title('Linear Scale Comparison')
+    ax2.set_xticks(x_pos)
+    ax2.set_xticklabels(datasets)
+    ax2.grid(True)
+
+    # Plot 3: Speedup relative to baseline (greedy)
+    if 'greedy' in results:
+        baseline = results['greedy']
+        for impl, timings in results.items():
+            if impl != 'greedy' and timings:
+                filtered_datasets = [d for d in datasets if d in timings and d in baseline]  # Filter valid datasets
+                x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+                speedups = [baseline[d] / timings[d] for d in filtered_datasets]
+                ax3.plot(x_pos_filtered, speedups, 
+                        f'{colors[impl]}-o', 
+                        label=f'{impl.capitalize()}')
+        ax3.axhline(y=1.0, color='k', linestyle='--')
+        ax3.set_title('Speedup vs. Greedy')
+        ax3.set_xticks(x_pos)
+        ax3.set_xticklabels(datasets)
+        ax3.grid(True)
+        ax3.legend()
+
+    plt.tight_layout()
+    plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
+    print("Plot saved as comparison_results.png")
+
+    try:
+        plt.show()
+    except Exception as e:
+        print(f"Could not display plot: {e}")
+
+if __name__ == "__main__":
+    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']
+
+    parser = argparse.ArgumentParser(description='Compare TSP implementations')
+    parser.add_argument('implementations', 
+                       nargs='*',
+                       choices=all_implementations,
+                       help='Implementations to compare. If none specified, runs all.')
+
+    args = parser.parse_args()
+    implementations = args.implementations if args.implementations else all_implementations
+    datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
+    cutoff = {
+        'brute': 'small',
+        'dp': 'medium',
+    }
+
+    results = {}
+    for impl in implementations:
+        results[impl] = {}
+        exec_path = f'./build/{impl}'
+
+        for dataset in datasets:
+            cutoff_point = cutoff.get(impl, None)
+            if dataset == cutoff_point:
+                break
+            print(f"Running {impl} on {dataset} dataset...")
+            execution_time = run_implementation(exec_path, dataset)
+            if execution_time >= 0:
+                results[impl][dataset] = execution_time
+                print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")
+
+    plot_results(results)