Skip to content

Commit

Permalink
pushing dp-cuda and comparison util function
Browse files Browse the repository at this point in the history
Ibrahim Fazili committed Dec 7, 2024
1 parent 7b9e9fe commit fbda4e1
Showing 2 changed files with 260 additions and 0 deletions.
139 changes: 139 additions & 0 deletions algorithms/dp-cuda.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
#include <bits/stdc++.h>
#include <iostream>
#include <vector>
#include <cmath>
#include <algorithm> // For std::next_permutation
#include <limits> // For std::numeric_limits
#include "../common/algorithms.hpp" // Include the common constants file

using namespace std;

#define MAX INT_MAX// Large value as infinity

// Function to compute distance
__host__ __device__
double compute_distance(const std::pair<double, double>& p1, const std::pair<double, double>& p2) {
return std::sqrt((p1.first - p2.first) * (p1.first - p2.first) +
(p1.second - p2.second) * (p1.second - p2.second));
}

__global__
void tsp_kernel(double* dp, int* parent, const double* distances, int n, int subset_size) {
__shared__ double shared_distances[32][32]; // Shared memory for distances
int mask = blockIdx.x; // Each block handles one subset
int u = threadIdx.x; // Each thread handles one node

if (mask >= (1 << n) || __popc(mask) != subset_size || u >= n) return;

// Load distances into shared memory
if (u < n) {
for (int v = 0; v < n; v++) {
shared_distances[u][v] = distances[u * n + v];
}
}
__syncthreads();

if (!(mask & (1 << u))) return;

int prev_mask = mask ^ (1 << u);
double min_cost = MAX;
int best_parent = -1;

for (int v = 0; v < n; v++) {
if (!(prev_mask & (1 << v))) continue;
double new_cost = dp[prev_mask * n + v] + shared_distances[v][u];
if (new_cost < min_cost) {
min_cost = new_cost;
best_parent = v;
}
}

dp[mask * n + u] = min_cost;
parent[mask * n + u] = best_parent;
}



// Host function to solve TSP
TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
int n = coordinates.size();
int full_mask = (1 << n) - 1;

// Precompute distances
std::vector<double> distances(n * n, 0.0);
for (int i = 0; i < n; i++) {
for (int j = 0; j < n; j++) {
distances[i * n + j] = compute_distance(coordinates[i], coordinates[j]);
}
}

// Allocate DP table and parent table
size_t dp_size = (1 << n) * n * sizeof(double);
size_t parent_size = (1 << n) * n * sizeof(int);
std::vector<double> dp_host((1 << n) * n, MAX);
std::vector<int> parent_host((1 << n) * n, -1);
dp_host[(1 << 0) * n + 0] = 0; // Equivalent to dp[1][0] = 0 in Python

double* dp_device;
int* parent_device;
double* distances_device;
cudaMalloc(&dp_device, dp_size);
cudaMalloc(&parent_device, parent_size);
cudaMalloc(&distances_device, n * n * sizeof(double));

cudaMemcpy(dp_device, dp_host.data(), dp_size, cudaMemcpyHostToDevice);
cudaMemcpy(parent_device, parent_host.data(), parent_size, cudaMemcpyHostToDevice);
cudaMemcpy(distances_device, distances.data(), n * n * sizeof(double), cudaMemcpyHostToDevice);

// Launch kernel for each subset size
for (int subset_size = 2; subset_size <= n; subset_size++) {
int num_blocks = (1 << n); // One block for each subset
int num_threads = n; // One thread for each node
tsp_kernel<<<num_blocks, num_threads>>>(dp_device, parent_device, distances_device, n, subset_size);
cudaDeviceSynchronize();
}

// Copy DP table and parent table back to host
cudaMemcpy(dp_host.data(), dp_device, dp_size, cudaMemcpyDeviceToHost);
cudaMemcpy(parent_host.data(), parent_device, parent_size, cudaMemcpyDeviceToHost);

// printf("DP:\n");
// for (int mask = 0; mask < (1 << n); mask++) { // Iterate through all subsets (2^n subsets)
// for (int u = 0; u < n; u++) { // Iterate through each node in the subset
// printf("%f ", dp_host[mask * n + u]);
// }
// printf("\n");
// }

// Find optimal cost and last node
double opt = MAX;
int last_node = -1;
for (int k = 1; k < n; k++) {
double new_cost = dp_host[full_mask * n + k] + distances[k * n];
if (new_cost < opt) {
opt = new_cost;
last_node = k;
}
}

// Reconstruct the path
std::vector<int> path;
int current_mask = full_mask;
int current_node = last_node;
while (current_node != -1) {
path.push_back(current_node);
int temp = current_node;
current_node = parent_host[current_mask * n + current_node];
current_mask ^= (1 << temp);
}
// path.push_back(0); // Add the starting node
std::reverse(path.begin(), path.end());

// Free GPU memory
cudaFree(dp_device);
cudaFree(parent_device);
cudaFree(distances_device);

// Return the result
return TSPResult{opt, path};
}
121 changes: 121 additions & 0 deletions utils/comparisons_all.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import subprocess
import time
import argparse
import matplotlib.pyplot as plt
import numpy as np

def run_implementation(exec_path: str, dataset: str) -> float:
try:
start = time.time()
result = subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE)
return time.time() - start
except Exception as e:
print(f"Error running {exec_path} with {dataset}: {e}")
return -1

def plot_results(results):
"""Plot timing results with multiple views"""
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

colors = {
'brute': 'b',
'greedy': 'g',
'genetic': 'r',
'dp': 'c',
'greedy_cuda': 'y'
}

datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
x_pos = range(len(datasets))

# Plot 1: Log scale (good for small differences)
for impl, timings in results.items():
if timings:
filtered_datasets = [d for d in datasets if d in timings] # Filter valid datasets for this implementation
x_pos_filtered = [datasets.index(d) for d in filtered_datasets] # Map filtered datasets to x_pos indices
ax1.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
f'{colors[impl]}-o',
label=f'{impl.capitalize()}')
ax1.set_yscale('log')
ax1.set_title('Log Scale Comparison')
ax1.set_xticks(x_pos)
ax1.set_xticklabels(datasets)
ax1.grid(True)
ax1.legend()

# Plot 2: Linear scale (good for large differences)
for impl, timings in results.items():
if timings:
filtered_datasets = [d for d in datasets if d in timings] # Filter valid datasets for this implementation
x_pos_filtered = [datasets.index(d) for d in filtered_datasets] # Map filtered datasets to x_pos indices
ax2.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
f'{colors[impl]}-o',
label=f'{impl.capitalize()}')
ax2.set_title('Linear Scale Comparison')
ax2.set_xticks(x_pos)
ax2.set_xticklabels(datasets)
ax2.grid(True)

# Plot 3: Speedup relative to baseline (greedy)
if 'greedy' in results:
baseline = results['greedy']
for impl, timings in results.items():
if impl != 'greedy' and timings:
filtered_datasets = [d for d in datasets if d in timings and d in baseline] # Filter valid datasets
x_pos_filtered = [datasets.index(d) for d in filtered_datasets] # Map filtered datasets to x_pos indices
speedups = [baseline[d] / timings[d] for d in filtered_datasets]
ax3.plot(x_pos_filtered, speedups,
f'{colors[impl]}-o',
label=f'{impl.capitalize()}')
ax3.axhline(y=1.0, color='k', linestyle='--')
ax3.set_title('Speedup vs. Greedy')
ax3.set_xticks(x_pos)
ax3.set_xticklabels(datasets)
ax3.grid(True)
ax3.legend()

plt.tight_layout()
plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
print("Plot saved as comparison_results.png")

try:
plt.show()
except Exception as e:
print(f"Could not display plot: {e}")

if __name__ == "__main__":
all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']

parser = argparse.ArgumentParser(description='Compare TSP implementations')
parser.add_argument('implementations',
nargs='*',
choices=all_implementations,
help='Implementations to compare. If none specified, runs all.')

args = parser.parse_args()
implementations = args.implementations if args.implementations else all_implementations
datasets = ['tiny', 'small', 'medium', 'large','huge','gigantic']
cutoff = {
'brute': 'small',
'dp': 'medium',
}

results = {}
for impl in implementations:
results[impl] = {}
exec_path = f'./build/{impl}'

for dataset in datasets:
cutoff_point = cutoff.get(impl, None)
if dataset == cutoff_point:
break
print(f"Running {impl} on {dataset} dataset...")
execution_time = run_implementation(exec_path, dataset)
if execution_time >= 0:
results[impl][dataset] = execution_time
print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")

plot_results(results)

0 comments on commit fbda4e1

Please sign in to comment.