diff --git a/algorithms/dp-cuda.cu b/algorithms/dp-cuda.cu
new file mode 100644
index 0000000..4253d5c
--- /dev/null
+++ b/algorithms/dp-cuda.cu
@@ -0,0 +1,139 @@
+#include <cmath>     // For std::sqrt
+#include <vector>
+#include <utility>   // For std::pair
+#include <climits>   // For INT_MAX
+#include <algorithm> // For std::reverse
+#include <limits>    // For std::numeric_limits
+#include "../common/algorithms.hpp" // Include the common constants file
+
+using namespace std;
+
+#define MAX INT_MAX // Large value used as infinity
+
+// Function to compute the Euclidean distance between two points
+__host__ __device__
+double compute_distance(const std::pair<double, double>& p1, const std::pair<double, double>& p2) {
+    return std::sqrt((p1.first - p2.first) * (p1.first - p2.first) +
+                     (p1.second - p2.second) * (p1.second - p2.second));
+}
+
+__global__
+void tsp_kernel(double* dp, int* parent, const double* distances, int n, int subset_size) {
+    __shared__ double shared_distances[32][32]; // Shared memory for distances (caps this kernel at 32 nodes)
+    int mask = blockIdx.x; // Each block handles one subset
+    int u = threadIdx.x;   // Each thread handles one end node
+
+    // All threads of a block share mask, so whole blocks exit together here
+    if (mask >= (1 << n) || __popc(mask) != subset_size || u >= n) return;
+
+    // Load distances into shared memory
+    if (u < n) {
+        for (int v = 0; v < n; v++) {
+            shared_distances[u][v] = distances[u * n + v];
+        }
+    }
+    __syncthreads();
+
+    if (!(mask & (1 << u))) return;
+
+    int prev_mask = mask ^ (1 << u);
+    double min_cost = MAX;
+    int best_parent = -1;
+
+    for (int v = 0; v < n; v++) {
+        if (!(prev_mask & (1 << v))) continue;
+        double new_cost = dp[prev_mask * n + v] + shared_distances[v][u];
+        if (new_cost < min_cost) {
+            min_cost = new_cost;
+            best_parent = v;
+        }
+    }
+
+    dp[mask * n + u] = min_cost;
+    parent[mask * n + u] = best_parent;
+}
+
+// Host function to solve TSP
+TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
+    int n = coordinates.size();
+    int full_mask = (1 << n) - 1;
+
+    // Precompute distances
+    std::vector<double> distances(n * n, 0.0);
+    for (int i = 0; i < n; i++) {
+        for (int j = 0; j < n; j++) {
+            distances[i * n + j] = compute_distance(coordinates[i], coordinates[j]);
+        }
+    }
+
+    // Allocate DP table and parent table
+    size_t dp_size = (1 << n) * n * sizeof(double);
+    size_t parent_size = (1 << n) * n * sizeof(int);
+    std::vector<double> dp_host((1 << n) * n, MAX);
+    std::vector<int> parent_host((1 << n) * n, -1);
+    dp_host[(1 << 0) * n + 0] = 0; // Equivalent to dp[1][0] = 0 in Python
+
+    double* dp_device;
+    int* parent_device;
+    double* distances_device;
+    cudaMalloc(&dp_device, dp_size);
+    cudaMalloc(&parent_device, parent_size);
+    cudaMalloc(&distances_device, n * n * sizeof(double));
+
+    cudaMemcpy(dp_device, dp_host.data(), dp_size, cudaMemcpyHostToDevice);
+    cudaMemcpy(parent_device, parent_host.data(), parent_size, cudaMemcpyHostToDevice);
+    cudaMemcpy(distances_device, distances.data(), n * n * sizeof(double), cudaMemcpyHostToDevice);
+
+    // Launch kernel for each subset size
+    for (int subset_size = 2; subset_size <= n; subset_size++) {
+        int num_blocks = (1 << n); // One block for each subset
+        int num_threads = n;       // One thread for each node
+        tsp_kernel<<<num_blocks, num_threads>>>(dp_device, parent_device, distances_device, n, subset_size);
+        cudaDeviceSynchronize();
+    }
+
+    // Copy DP table and parent table back to host
+    cudaMemcpy(dp_host.data(), dp_device, dp_size, cudaMemcpyDeviceToHost);
+    cudaMemcpy(parent_host.data(), parent_device, parent_size, cudaMemcpyDeviceToHost);
+
+    // printf("DP:\n");
+    // for (int mask = 0; mask < (1 << n); mask++) { // Iterate through all subsets (2^n subsets)
+    //     for (int u = 0; u < n; u++) { // Iterate through each node in the subset
+    //         printf("%f ", dp_host[mask * n + u]);
+    //     }
+    //     printf("\n");
+    // }
+
+    // Find optimal cost and last node
+    double opt = MAX;
+    int last_node = -1;
+    for (int k = 1; k < n; k++) {
+        double new_cost = dp_host[full_mask * n + k] + distances[k * n]; // distances[k][0]: return to the start node
+        if (new_cost < opt) {
+            opt = new_cost;
+            last_node = k;
+        }
+    }
+
+    // Reconstruct the path
+    std::vector<int> path;
+    int current_mask = full_mask;
+    int current_node = last_node;
+    while (current_node != -1) {
+        path.push_back(current_node);
+        int temp = current_node;
+        current_node = parent_host[current_mask * n + current_node];
+        current_mask ^= (1 << temp);
+    }
+    // path.push_back(0); // Add the starting node
+    std::reverse(path.begin(), path.end());
+
+    // Free GPU memory
+    cudaFree(dp_device);
+    cudaFree(parent_device);
+    cudaFree(distances_device);
+
+    // Return the result
+    return TSPResult{opt, path};
+}
\ No newline at end of file
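The kernel and host code above implement the Held-Karp recurrence dp[mask][u] = min over v in mask \ {u} of dp[mask ^ (1 << u)][v] + dist[v][u], with one CUDA block per subset mask and one thread per end node u. As a cross-check on small inputs, here is a plain-Python sketch of the same recurrence; the name held_karp_reference and its signature are illustrative, not part of the repository, and it assumes (as the CUDA code does) that node 0 is the fixed start of the tour.

    # Plain-Python reference for the Held-Karp DP that tsp_kernel parallelizes.
    # Illustrative sketch only: name and signature are not from the repository.
    import math

    def held_karp_reference(coords):
        n = len(coords)
        dist = [[math.dist(a, b) for b in coords] for a in coords]
        INF = float('inf')
        # dp[mask][u]: cheapest path that starts at node 0, visits exactly the
        # nodes in mask, and ends at node u (mirrors dp_host[mask * n + u]).
        dp = [[INF] * n for _ in range(1 << n)]
        dp[1][0] = 0.0  # same base case as dp_host[(1 << 0) * n + 0] = 0
        for mask in range(1 << n):
            if not (mask & 1) or mask == 1:
                continue  # every partial path must contain the start node 0
            for u in range(1, n):
                if not (mask & (1 << u)):
                    continue
                prev = mask ^ (1 << u)  # same subset without the end node u
                dp[mask][u] = min(dp[prev][v] + dist[v][u]
                                  for v in range(n) if prev & (1 << v))
        full = (1 << n) - 1
        # Close the tour back to node 0, as solve() does with distances[k * n]
        return min(dp[full][k] + dist[k][0] for k in range(1, n))

For any instance small enough for both to finish, the value returned here should match the opt computed in solve() up to floating-point rounding.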
diff --git a/utils/comparisons_all.py b/utils/comparisons_all.py
new file mode 100644
index 0000000..5c8c2d4
--- /dev/null
+++ b/utils/comparisons_all.py
@@ -0,0 +1,121 @@
+#!/usr/bin/env python3
+import subprocess
+import time
+import argparse
+import matplotlib.pyplot as plt
+import numpy as np
+
+def run_implementation(exec_path: str, dataset: str) -> float:
+    try:
+        start = time.time()
+        result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
+                                stdout=subprocess.PIPE,
+                                stderr=subprocess.PIPE)
+        return time.time() - start
+    except Exception as e:
+        print(f"Error running {exec_path} with {dataset}: {e}")
+        return -1
+
+def plot_results(results):
+    """Plot timing results with multiple views"""
+    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))
+
+    colors = {
+        'brute': 'b',
+        'greedy': 'g',
+        'genetic': 'r',
+        'dp': 'c',
+        'greedy_cuda': 'y'
+    }
+
+    datasets = ['tiny', 'small', 'medium', 'large', 'huge', 'gigantic']
+    x_pos = range(len(datasets))
+
+    # Plot 1: Log scale (good for small differences)
+    for impl, timings in results.items():
+        if timings:
+            filtered_datasets = [d for d in datasets if d in timings]  # Filter valid datasets for this implementation
+            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+            ax1.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
+                     f'{colors[impl]}-o',
+                     label=f'{impl.capitalize()}')
+    ax1.set_yscale('log')
+    ax1.set_title('Log Scale Comparison')
+    ax1.set_xticks(x_pos)
+    ax1.set_xticklabels(datasets)
+    ax1.grid(True)
+    ax1.legend()
+
+    # Plot 2: Linear scale (good for large differences)
+    for impl, timings in results.items():
+        if timings:
+            filtered_datasets = [d for d in datasets if d in timings]  # Filter valid datasets for this implementation
+            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+            ax2.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
+                     f'{colors[impl]}-o',
+                     label=f'{impl.capitalize()}')
+    ax2.set_title('Linear Scale Comparison')
+    ax2.set_xticks(x_pos)
+    ax2.set_xticklabels(datasets)
+    ax2.grid(True)
+
+    # Plot 3: Speedup relative to baseline (greedy)
+    if 'greedy' in results:
+        baseline = results['greedy']
+        for impl, timings in results.items():
+            if impl != 'greedy' and timings:
+                filtered_datasets = [d for d in datasets if d in timings and d in baseline]  # Filter valid datasets
+                x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
+                speedups = [baseline[d] / timings[d] for d in filtered_datasets]
+                ax3.plot(x_pos_filtered, speedups,
+                         f'{colors[impl]}-o',
+                         label=f'{impl.capitalize()}')
+        ax3.axhline(y=1.0, color='k', linestyle='--')
+        ax3.set_title('Speedup vs. Greedy')
+        ax3.set_xticks(x_pos)
+        ax3.set_xticklabels(datasets)
+        ax3.grid(True)
+        ax3.legend()
+
+    plt.tight_layout()
+    plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
+    print("Plot saved as comparison_results.png")
+
+    try:
+        plt.show()
+    except Exception as e:
+        print(f"Could not display plot: {e}")
+
+if __name__ == "__main__":
+    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']
+
+    parser = argparse.ArgumentParser(description='Compare TSP implementations')
+    parser.add_argument('implementations',
+                        nargs='*',
+                        choices=all_implementations,
+                        help='Implementations to compare. If none specified, runs all.')
+
+    args = parser.parse_args()
+    implementations = args.implementations if args.implementations else all_implementations
+    datasets = ['tiny', 'small', 'medium', 'large', 'huge', 'gigantic']
+    cutoff = {
+        'brute': 'small',   # stop brute force before the 'small' dataset
+        'dp': 'medium',     # stop DP before the 'medium' dataset
+    }
+
+    results = {}
+    for impl in implementations:
+        results[impl] = {}
+        exec_path = f'./build/{impl}'
+
+        for dataset in datasets:
+            cutoff_point = cutoff.get(impl, None)
+            if dataset == cutoff_point:
+                break
+            print(f"Running {impl} on {dataset} dataset...")
+            execution_time = run_implementation(exec_path, dataset)
+            if execution_time >= 0:
+                results[impl][dataset] = execution_time
+                print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")
+
+    plot_results(results)
\ No newline at end of file
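run_implementation above measures wall-clock time with time.time() and treats every run as successful. If a binary crashes or hangs on the larger datasets, a more defensive variant can keep the comparison loop going. The sketch below keeps the same return convention (elapsed seconds, or -1 on failure), so it could stand in for run_implementation in the main loop; the name run_implementation_checked and the one-hour timeout are assumptions, not part of the script.

    # Defensive timing helper (sketch): monotonic timer, timeout, and exit-status check.
    # Same CLI assumption as above: <exec_path> --csv data/<dataset>.csv
    import subprocess
    import time

    def run_implementation_checked(exec_path: str, dataset: str, timeout_s: float = 3600.0) -> float:
        try:
            start = time.perf_counter()  # monotonic clock, better suited to intervals than time.time()
            result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
                                    stdout=subprocess.PIPE,
                                    stderr=subprocess.PIPE,
                                    timeout=timeout_s)
            elapsed = time.perf_counter() - start
            if result.returncode != 0:
                print(f"{exec_path} exited with status {result.returncode} on {dataset}")
                return -1
            return elapsed
        except subprocess.TimeoutExpired:
            print(f"{exec_path} timed out on {dataset} after {timeout_s:.0f}s")
            return -1
        except Exception as e:
            print(f"Error running {exec_path} with {dataset}: {e}")
            return -1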