Commit fbda4e1
pushing dp-cuda and comparison util function
Ibrahim Fazili committed on Dec 7, 2024
1 parent: 7b9e9fe
Showing 2 changed files with 260 additions and 0 deletions.
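
The first file is the dp-cuda solver named in the commit message: a Held-Karp dynamic program in which each CUDA block handles one subset of nodes and each thread one end node. The second file is the Python utility that times the compiled implementations against each other and plots the results. The kernel evaluates the standard bitmask recurrence

    dp[mask][u] = min over v in mask \ {u} of ( dp[mask ^ (1 << u)][v] + dist[v][u] )

with base case dp[{0}][0] = 0, processing one subset size per launch, for O(2^n * n^2) work overall.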
@@ -0,0 +1,139 @@
#include <iostream>
#include <vector>
#include <cmath>
#include <climits>   // For INT_MAX
#include <algorithm> // For std::reverse
#include <limits>    // For std::numeric_limits
#include "../common/algorithms.hpp" // Include the common constants file

using namespace std;

#define MAX INT_MAX // Large value used as infinity

// Euclidean distance between two points, callable from host and device
__host__ __device__
double compute_distance(const std::pair<double, double>& p1, const std::pair<double, double>& p2) {
    return std::sqrt((p1.first - p2.first) * (p1.first - p2.first) +
                     (p1.second - p2.second) * (p1.second - p2.second));
}

// Held-Karp DP step: for each subset (mask) of the given size and each node u
// in it, find the cheapest path that starts at node 0, visits exactly the
// nodes in mask, and ends at u
__global__
void tsp_kernel(double* dp, int* parent, const double* distances, int n, int subset_size) {
    __shared__ double shared_distances[32][32]; // Shared memory for distances (assumes n <= 32)
    int mask = blockIdx.x; // Each block handles one subset
    int u = threadIdx.x;   // Each thread handles one node

    if (mask >= (1 << n) || __popc(mask) != subset_size || u >= n) return;

    // Each thread loads its row of the distance matrix into shared memory
    for (int v = 0; v < n; v++) {
        shared_distances[u][v] = distances[u * n + v];
    }
    __syncthreads();

    if (!(mask & (1 << u))) return; // u must belong to this subset

    int prev_mask = mask ^ (1 << u);
    double min_cost = MAX;
    int best_parent = -1;

    // Try every node v in the remaining subset as the predecessor of u
    for (int v = 0; v < n; v++) {
        if (!(prev_mask & (1 << v))) continue;
        double new_cost = dp[prev_mask * n + v] + shared_distances[v][u];
        if (new_cost < min_cost) {
            min_cost = new_cost;
            best_parent = v;
        }
    }

    dp[mask * n + u] = min_cost;
    parent[mask * n + u] = best_parent;
}

// Host function to solve TSP via the Held-Karp dynamic program
TSPResult solve(const std::vector<std::pair<double, double>>& coordinates) {
    int n = coordinates.size();
    int full_mask = (1 << n) - 1;

    // Precompute pairwise distances (row-major n x n matrix)
    std::vector<double> distances(n * n, 0.0);
    for (int i = 0; i < n; i++) {
        for (int j = 0; j < n; j++) {
            distances[i * n + j] = compute_distance(coordinates[i], coordinates[j]);
        }
    }

    // Allocate DP table and parent table
    size_t dp_size = (1 << n) * n * sizeof(double);
    size_t parent_size = (1 << n) * n * sizeof(int);
    std::vector<double> dp_host((1 << n) * n, MAX);
    std::vector<int> parent_host((1 << n) * n, -1);
    dp_host[(1 << 0) * n + 0] = 0; // Base case: the tour starts at node 0, i.e. dp[{0}][0] = 0

    double* dp_device;
    int* parent_device;
    double* distances_device;
    cudaMalloc(&dp_device, dp_size);
    cudaMalloc(&parent_device, parent_size);
    cudaMalloc(&distances_device, n * n * sizeof(double));

    cudaMemcpy(dp_device, dp_host.data(), dp_size, cudaMemcpyHostToDevice);
    cudaMemcpy(parent_device, parent_host.data(), parent_size, cudaMemcpyHostToDevice);
    cudaMemcpy(distances_device, distances.data(), n * n * sizeof(double), cudaMemcpyHostToDevice);

    // Launch one kernel per subset size; sizes must be processed in increasing
    // order because subsets of size k depend on subsets of size k - 1
    for (int subset_size = 2; subset_size <= n; subset_size++) {
        int num_blocks = (1 << n); // One block for each subset
        int num_threads = n;       // One thread for each node
        tsp_kernel<<<num_blocks, num_threads>>>(dp_device, parent_device, distances_device, n, subset_size);
        cudaDeviceSynchronize();
    }

    // Copy DP table and parent table back to host
    cudaMemcpy(dp_host.data(), dp_device, dp_size, cudaMemcpyDeviceToHost);
    cudaMemcpy(parent_host.data(), parent_device, parent_size, cudaMemcpyDeviceToHost);

    // Find the optimal tour cost and the last node before returning to node 0
    double opt = MAX;
    int last_node = -1;
    for (int k = 1; k < n; k++) {
        double new_cost = dp_host[full_mask * n + k] + distances[k * n]; // close the tour: k -> 0
        if (new_cost < opt) {
            opt = new_cost;
            last_node = k;
        }
    }

    // Reconstruct the path by walking the parent table backwards from last_node;
    // the walk ends at node 0, whose parent is -1
    std::vector<int> path;
    int current_mask = full_mask;
    int current_node = last_node;
    while (current_node != -1) {
        path.push_back(current_node);
        int temp = current_node;
        current_node = parent_host[current_mask * n + current_node];
        current_mask ^= (1 << temp);
    }
    std::reverse(path.begin(), path.end()); // Node 0 ends up at the front

    // Free GPU memory
    cudaFree(dp_device);
    cudaFree(parent_device);
    cudaFree(distances_device);

    // Return the result
    return TSPResult{opt, path};
}
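
For reference, a minimal driver for the solver above might look like the sketch below. It assumes ../common/algorithms.hpp declares solve and the two-member aggregate TSPResult (cost first, path second, as TSPResult{opt, path} suggests), and it uses C++17 structured bindings so the member names never need to be spelled out:

// Hypothetical driver, not part of this commit
#include <cstdio>
#include <vector>
#include <utility>
#include "../common/algorithms.hpp" // assumed to declare TSPResult and solve

int main() {
    // Four corners of the unit square; the optimal tour cost is 4.0
    std::vector<std::pair<double, double>> pts = {
        {0.0, 0.0}, {1.0, 0.0}, {1.0, 1.0}, {0.0, 1.0}
    };
    auto [cost, path] = solve(pts); // members bound positionally
    std::printf("cost = %f\npath:", cost);
    for (int node : path) std::printf(" %d", node);
    std::printf("\n");
    return 0;
}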
@@ -0,0 +1,121 @@
#!/usr/bin/env python3
import subprocess
import time
import argparse
import matplotlib.pyplot as plt

def run_implementation(exec_path: str, dataset: str) -> float:
    """Time one TSP binary on one dataset; returns wall-clock seconds, or -1 on error.

    Note: the measurement includes process startup, not just the solve itself.
    """
    try:
        start = time.time()
        subprocess.run([exec_path, '--csv', f'data/{dataset}.csv'],
                       stdout=subprocess.PIPE,
                       stderr=subprocess.PIPE)
        return time.time() - start
    except Exception as e:
        print(f"Error running {exec_path} with {dataset}: {e}")
        return -1

def plot_results(results):
    """Plot timing results with multiple views"""
    fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(20, 6))

    colors = {
        'brute': 'b',
        'greedy': 'g',
        'genetic': 'r',
        'dp': 'c',
        'greedy_cuda': 'y'
    }

    datasets = ['tiny', 'small', 'medium', 'large', 'huge', 'gigantic']
    x_pos = range(len(datasets))

    # Plot 1: Log scale (good for small differences)
    for impl, timings in results.items():
        if timings:
            filtered_datasets = [d for d in datasets if d in timings]  # Filter valid datasets for this implementation
            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]  # Map filtered datasets to x_pos indices
            ax1.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
                     f'{colors[impl]}-o',
                     label=f'{impl.capitalize()}')
    ax1.set_yscale('log')
    ax1.set_title('Log Scale Comparison')
    ax1.set_xticks(x_pos)
    ax1.set_xticklabels(datasets)
    ax1.grid(True)
    ax1.legend()

    # Plot 2: Linear scale (good for large differences)
    for impl, timings in results.items():
        if timings:
            filtered_datasets = [d for d in datasets if d in timings]
            x_pos_filtered = [datasets.index(d) for d in filtered_datasets]
            ax2.plot(x_pos_filtered, [timings[d] for d in filtered_datasets],
                     f'{colors[impl]}-o',
                     label=f'{impl.capitalize()}')
    ax2.set_title('Linear Scale Comparison')
    ax2.set_xticks(x_pos)
    ax2.set_xticklabels(datasets)
    ax2.grid(True)

    # Plot 3: Speedup relative to baseline (greedy)
    if 'greedy' in results:
        baseline = results['greedy']
        for impl, timings in results.items():
            if impl != 'greedy' and timings:
                filtered_datasets = [d for d in datasets if d in timings and d in baseline]
                x_pos_filtered = [datasets.index(d) for d in filtered_datasets]
                speedups = [baseline[d] / timings[d] for d in filtered_datasets]
                ax3.plot(x_pos_filtered, speedups,
                         f'{colors[impl]}-o',
                         label=f'{impl.capitalize()}')
        ax3.axhline(y=1.0, color='k', linestyle='--')
        ax3.set_title('Speedup vs. Greedy')
        ax3.set_xticks(x_pos)
        ax3.set_xticklabels(datasets)
        ax3.grid(True)
        ax3.legend()

    plt.tight_layout()
    plt.savefig('comparison_results.png', dpi=300, bbox_inches='tight')
    print("Plot saved as comparison_results.png")

    try:
        plt.show()
    except Exception as e:
        print(f"Could not display plot: {e}")

if __name__ == "__main__":
    all_implementations = ['brute', 'greedy', 'genetic', 'dp', 'greedy_cuda']

    parser = argparse.ArgumentParser(description='Compare TSP implementations')
    parser.add_argument('implementations',
                        nargs='*',
                        choices=all_implementations,
                        help='Implementations to compare. If none specified, runs all.')

    args = parser.parse_args()
    implementations = args.implementations if args.implementations else all_implementations
    datasets = ['tiny', 'small', 'medium', 'large', 'huge', 'gigantic']
    # First dataset that is too expensive for an implementation; the loop below
    # stops before running it and skips everything larger
    cutoff = {
        'brute': 'small',
        'dp': 'medium',
    }

    results = {}
    for impl in implementations:
        results[impl] = {}
        exec_path = f'./build/{impl}'

        for dataset in datasets:
            cutoff_point = cutoff.get(impl, None)
            if dataset == cutoff_point:
                break  # Datasets are ordered smallest to largest, so stop here
            print(f"Running {impl} on {dataset} dataset...")
            execution_time = run_implementation(exec_path, dataset)
            if execution_time >= 0:
                results[impl][dataset] = execution_time
                print(f"{impl.capitalize()}, {dataset}: {execution_time:.6f} seconds")

    plot_results(results)
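
A quick way to exercise the plotting code without building any of the binaries is to feed plot_results a hand-built results dict; the implementation names below match the colors table, and the timings are made up purely for illustration:

# Hypothetical timings, only for exercising plot_results
fake_results = {
    'greedy':      {'tiny': 0.01, 'small': 0.04, 'medium': 0.35, 'large': 2.10},
    'dp':          {'tiny': 0.02, 'small': 0.60},
    'greedy_cuda': {'tiny': 0.05, 'small': 0.06, 'medium': 0.09, 'large': 0.40},
}
plot_results(fake_results)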