Skip to content

Commit

Permalink
update profiling utils and naming
Browse files Browse the repository at this point in the history
  • Loading branch information
Aspen Cherie Smith committed Dec 12, 2024
1 parent 991075c commit 4662e10
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 35 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
.vscode/
build/
*.ipynb
*.png
*.png
vtune/
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ SRC_DIR=common
ALGO_DIR=algorithms
BUILD_DIR=build

all: brute dp genetic greedy genetic_cuda dp_omp dp_cuda greedy_cuda dp_numa
all: brute dp genetic greedy genetic_cuda dp_omp dp_cuda greedy_cuda dp_omp_numa

brute: $(BUILD_DIR)/brute
dp: $(BUILD_DIR)/dp
Expand All @@ -23,7 +23,7 @@ genetic_cuda: $(BUILD_DIR)/genetic_cuda
dp_omp: $(BUILD_DIR)/dp_omp
greedy_cuda: $(BUILD_DIR)/greedy_cuda
dp_cuda: $(BUILD_DIR)/dp_cuda
dp_numa: $(BUILD_DIR)/dp_numa
dp_omp_numa: $(BUILD_DIR)/dp_omp_numa

$(BUILD_DIR)/brute: $(SRC_DIR)/main.cpp $(ALGO_DIR)/brute.cpp
$(CPP) $^ -o $@ $(CFLAGS) $(OPTFLAGS)
Expand All @@ -49,7 +49,7 @@ $(BUILD_DIR)/greedy_cuda: $(SRC_DIR)/main.cpp $(ALGO_DIR)/greedy_cuda.cu
$(BUILD_DIR)/dp_cuda: $(SRC_DIR)/main.cpp $(ALGO_DIR)/dp_cuda.cu
$(NVCC) $^ -o $@ $(NVCCFLAGS)

$(BUILD_DIR)/dp_numa: $(SRC_DIR)/main.cpp $(ALGO_DIR)/dp_numa.cpp
$(BUILD_DIR)/dp_omp_numa: $(SRC_DIR)/main.cpp $(ALGO_DIR)/dp_omp_numa.cpp
$(CPP) $^ -o $@ $(CFLAGS_NUMA) $(OPTFLAGS_DP)

.PHONY: clean
Expand Down
1 change: 0 additions & 1 deletion algorithms/dp_omp.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
// 0m40.416s
#include <bits/stdc++.h>
#include <omp.h> // Include OpenMP header for parallelization
#include <vector>
Expand Down
File renamed without changes.
25 changes: 25 additions & 0 deletions data/small_25.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
4.179924564742493187e-01,3.441119252755281410e-01
5.893815561138715520e-02,6.795302883652539760e-01
4.784590047029411153e-01,5.381499589254157279e-02
5.211594296883900190e-01,8.066037608804467185e-01
5.806320199484605871e-01,6.029501879313505786e-01
3.091989979348186424e-01,6.552131835038437213e-01
9.198826252545819759e-01,7.476597889282088349e-01
6.553475030164537252e-01,5.356261761284698553e-01
3.492413849207449683e-01,1.091564818399735071e-01
5.410940445399985421e-01,4.961962866529807670e-01
4.490533956886093581e-01,1.723426276209378827e-01
2.823209606253457293e-01,6.839891964077760012e-01
2.958765252205050045e-01,5.298252112514997281e-01
5.634809826131318422e-01,6.252242659304493300e-01
7.151912790861654212e-01,3.151430227027559594e-01
5.176186770719936048e-01,4.915106394415269531e-01
3.520418285904083344e-01,8.201420385192299545e-01
6.327788586249427638e-01,1.326204556352319486e-01
8.731190201719776223e-01,1.213094822729351430e-01
1.678884446672987973e-01,4.462149309981606748e-01
9.874783222814236261e-01,4.093413656794342437e-02
3.494149296248078018e-01,3.005167089028528693e-01
8.262051275325252853e-01,4.093904125930584170e-01
6.546154192348774270e-02,3.725193910445782075e-01
5.353288386336307880e-03,3.399701713068946640e-01
30 changes: 0 additions & 30 deletions data/small_30.csv

This file was deleted.

278 changes: 278 additions & 0 deletions utils/vtune.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,278 @@
import os
import subprocess
import time
import numpy as np
import matplotlib.pyplot as plt
import re
import pandas as pd

# Thread counts to sweep (cores / OpenMP threads per profiling job)
h_values = [8, 16, 32, 64, 128]

# Template for the sbatch script content.
# Bug fix: this commit renames the Make target dp_numa -> dp_omp_numa
# (see Makefile), so the build and run commands below must use the new
# name or the job fails with "No rule to make target 'dp_numa'".
sbatch_template = """#!/bin/bash
#SBATCH -J sph
#SBATCH -o vtune_%j.out
#SBATCH -e vtune_%j.err
#SBATCH -A m4776
#SBATCH -C cpu
#SBATCH -c {h_value}
#SBATCH --qos=debug
#SBATCH -t 00:15:00
#SBATCH -N 1
#SBATCH -n 1
module load intel-oneapi/2022.1.0
export SLURM_CPU_BIND="cores"
export OMP_NUM_THREADS={h_value}
export OMP_PROC_BIND=spread
export OMP_PLACES=threads
make clean
make dp_omp_numa
mkdir -p $HOME/cs5220-tsp-optimization/vtune/vtune_results_{h_value}
vtune -collect threading -result-dir $HOME/cs5220-tsp-optimization/vtune/vtune_results_{h_value} -- srun ./build/dp_omp_numa --csv data/small_25.csv
"""

# Maps submitted Slurm job id -> the h value (thread count) that job tests.
job_h_map = {}

# Function to check if any of the given Slurm jobs are still queued/running.
def are_jobs_running(job_ids):
    """Return True while any job id in `job_ids` still appears in `squeue`.

    job_ids: iterable of Slurm job-id strings captured from sbatch output.
    Returns False when squeue lists none of them (or produces no output).
    """
    result = subprocess.run("squeue -u $USER", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    running_jobs = set()
    for line in result.stdout.splitlines()[1:]:  # skip the squeue header line
        parts = line.split()
        if parts:  # bug fix: a blank line would make parts[0] raise IndexError
            running_jobs.add(parts[0])  # first squeue column is the job id
    # Return whether any of the provided job IDs are still running
    return any(job_id in running_jobs for job_id in job_ids)

# Submit the profiling jobs in batches of `batch_size` so we never occupy
# more than two debug-queue slots at once.
h_iter = iter(h_values)
batch_size = 2

while True:
    # Submit up to `batch_size` jobs, one per remaining h value.
    job_ids = []
    for _ in range(batch_size):
        try:
            h = next(h_iter)
        except StopIteration:
            break

        # Create a temporary sbatch file with the appropriate h value
        sbatch_file = f"submit_vtune_job_{h}.sub"
        with open(sbatch_file, 'w') as f:
            f.write(sbatch_template.format(h_value=h))

        # Submit the sbatch job
        print(f"submitting program with {h} num threads...")
        result = subprocess.run(f"sbatch {sbatch_file}", shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)

        # Capture the job ID from sbatch's "Submitted batch job <id>" line.
        # (Removed dead `submitted = True` assignment — it was never read.)
        for line in result.stdout.splitlines():
            if "Submitted batch job" in line:
                job_id = line.split()[-1]
                job_h_map[job_id] = h
                job_ids.append(job_id)
                print(f"job {job_id} submitted for {h} num threads")

    # Wait for this batch to complete before submitting more.
    if job_ids:
        print(f"waiting for jobs {job_ids} to complete...")
        while are_jobs_running(job_ids):
            time.sleep(10)  # Check every 10 seconds

        # Rename output files from vtune_<job_id>.out to vtune_<h>_thread.out
        for job_id in job_ids:
            h = job_h_map[job_id]
            old_output_file = f"vtune_{job_id}.out"
            new_output_file = f"vtune_{h}_thread.out"
            if os.path.exists(old_output_file):
                os.rename(old_output_file, new_output_file)
                print(f"renamed {old_output_file} to {new_output_file}")

    # An empty batch means h_iter is exhausted — stop submitting.
    # NOTE(review): a batch whose sbatch calls all fail also yields an empty
    # job_ids and ends the loop early, silently skipping remaining h values.
    if not job_ids:
        print('all values have been submitted')
        break

# Placeholder for results: (h, num_particles, execution_time)
# NOTE(review): this list is never populated anywhere below — confirm it
# is still needed.
results = []

def finalize_vtune_results(result_dir, h):
    """Finalize one VTune result directory and export its summary as CSV.

    result_dir: path to the vtune result directory for one run.
    h: thread count of that run (used in log messages and the CSV name).
    Returns None; progress and errors are reported via print().
    """
    if not os.path.exists(result_dir):
        print(f"result directory {result_dir} does not exist. Skipping finalization.")
        return  # bug fix: previously fell through and ran vtune anyway

    # Finalize the results
    finalize_command = f"vtune -finalize -r {result_dir}"
    result = subprocess.run(finalize_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    if result.returncode != 0:
        print(f"error finalizing results for {h} processors")
        print(result.stderr)
    else:
        print(f"finalized results for {h} processors")

    # Save the summary report to a CSV file
    report_file = f"/global/homes/a/acs378/cs5220-tsp-optimization/vtune/vtune_summaries/summary_{h}_thread.csv"
    report_command = f"vtune -report summary -r {result_dir} -format csv -report-output {report_file}"
    result = subprocess.run(report_command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
    if result.returncode != 0:
        print(f"error saving report summary for {h} processors")
        print(result.stderr)
    else:
        print(f"saved report summary to {report_file}")

# Finalize every per-thread-count VTune result directory and export summaries.
for h in h_values:
    finalize_vtune_results(f"/global/homes/a/acs378/cs5220-tsp-optimization/vtune/vtune_results_{h}", h)

# Pull the thread count out of each summary file name, and the
# poor-utilization wait time out of its contents.
h_value_pattern = re.compile(r'summary_(\d+)_thread\.csv')
wait_time_pattern = re.compile(r'Wait Time with poor CPU Utilization\s+(\d+\.\d+)s')
wait_times = []
h_values = []
summary_dir = '/global/homes/a/acs378/cs5220-tsp-optimization/vtune/vtune_summaries'

# Iterate over the files in the summaries directory.
for filename in os.listdir(summary_dir):
    name_match = h_value_pattern.match(filename)
    if name_match is None:
        continue  # not a summary file
    h_value = int(name_match.group(1))

    # Read the content of the file
    with open(os.path.join(summary_dir, filename), 'r') as file:
        content = file.read()

    # Extract the Wait Time with poor CPU Utilization; skip files without it
    # so h_values and wait_times stay in lockstep.
    time_match = wait_time_pattern.search(content)
    if time_match is None:
        continue
    wait_time = float(time_match.group(1))
    print(wait_time)
    h_values.append(h_value)
    wait_times.append(wait_time)

# Create a DataFrame for easier plotting
print(h_values)
print(wait_times)
data = pd.DataFrame({
    'h_value': h_values,
    'Wait Time with poor CPU Utilization': wait_times,
})

# Sort data by h_value
data = data.sort_values(by='h_value')

# Plot the results
plt.figure(figsize=(10, 6))
plt.plot(data['h_value'], data['Wait Time with poor CPU Utilization'], '-o', label='Wait Time vs CPU Threads')
plt.xlabel('Number of Threads')
plt.ylabel('Wait Time with Poor CPU Utilization (s)')
plt.title('Wait Time with Poor CPU Utilization vs Number of Threads')
plt.grid(True)
plt.legend()
plt.savefig('vtune/graph.png')

# Directory containing the summary files
directory = '/global/homes/a/acs378/cs5220-tsp-optimization/vtune/vtune_summaries'  # Replace with the actual path

# Lists to store extracted data
h_values = []
wait_times = []
total_times = []
serial_times = []
cpu_utilizations = []

# Regex patterns to extract metrics from the file content
h_value_pattern = re.compile(r'summary_(\d+)_thread\.csv')
wait_time_pattern = re.compile(r'Wait Time with poor CPU Utilization\s+(\d+\.\d+)s')
elapsed_time_pattern = re.compile(r'Elapsed Time\s+(\d+\.\d+)')
serial_time_pattern = re.compile(r'Serial Time \(outside parallel regions\)\s+(\d+\.\d+)s')
cpu_utilization_pattern = re.compile(r'Effective CPU Utilization\s+(\d+\.\d+)%')

# Iterate over the files in the directory
for filename in os.listdir(directory):
    h_value_match = h_value_pattern.match(filename)
    if h_value_match:
        # Extract h_value from the filename
        h_value = int(h_value_match.group(1))

        # Read the content of the file
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as file:
            content = file.read()

        # Extract the metrics
        wait_time_match = wait_time_pattern.search(content)
        elapsed_time_match = elapsed_time_pattern.search(content)
        serial_time_match = serial_time_pattern.search(content)
        cpu_utilization_match = cpu_utilization_pattern.search(content)

        if wait_time_match and elapsed_time_match:
            # Store the extracted data
            h_values.append(h_value)
            wait_times.append(float(wait_time_match.group(1)))
            total_times.append(float(elapsed_time_match.group(1)))
            # Bug fix: always append (None when absent) so these lists keep
            # the same length as h_values — ragged lists previously made the
            # DataFrame column assignments below raise ValueError whenever
            # only some files reported these optional metrics.
            serial_times.append(float(serial_time_match.group(1)) if serial_time_match else None)
            cpu_utilizations.append(float(cpu_utilization_match.group(1)) if cpu_utilization_match else None)

# Create a DataFrame for easier plotting
data = pd.DataFrame({
    'h_value': h_values,
    'Wait Time with poor CPU Utilization': wait_times,
    'Elapsed Time': total_times
})

# Only attach the optional columns when at least one file reported them.
if any(t is not None for t in serial_times):
    data['Serial Time'] = serial_times

if any(u is not None for u in cpu_utilizations):
    data['CPU Utilization'] = cpu_utilizations

data = data.sort_values(by='h_value')

# Calculate normalized wait times
data['Normalized Wait Time'] = data['Wait Time with poor CPU Utilization'] / data['Elapsed Time']

def _line_plot(x, y, label, xlabel, ylabel, title, outfile):
    """Render one x/y line plot with markers and save it to `outfile`."""
    plt.figure(figsize=(10, 6))
    plt.plot(x, y, '-o', label=label)
    plt.xlabel(xlabel)
    plt.ylabel(ylabel)
    plt.title(title)
    plt.grid(True)
    plt.legend()
    plt.savefig(outfile)

# Plot 2: Total Elapsed Time vs Number of Threads
_line_plot(data['h_value'], data['Elapsed Time'],
           'Total Elapsed Time',
           'Number of CPU Cores/Threads (h_value)',
           'Elapsed Time (s)',
           'Total Elapsed Time vs CPU Cores/Threads',
           'vtune/total_elapsed_time_v_num_threads.png')

# Plot 3: Normalized Wait Time (as a fraction of total time) vs Number of Threads
_line_plot(data['h_value'], data['Normalized Wait Time'],
           'Normalized Wait Time (fraction of total time)',
           'Number of CPU Cores/Threads (h_value)',
           'Normalized Wait Time (fraction of total time)',
           'Normalized Wait Time vs CPU Cores/Threads',
           'vtune/normalized_wait_time_v_num_threads.png')

# Plot 4: Serial Time vs Number of Threads (if Serial Time is available)
if 'Serial Time' in data.columns:
    _line_plot(data['h_value'], data['Serial Time'],
               'Serial Time (outside parallel regions)',
               'Number of CPU Cores/Threads (h_value)',
               'Serial Time (s)',
               'Serial Time vs CPU Cores/Threads',
               'vtune/serial_time_v_num_threads.png')

0 comments on commit 4662e10

Please sign in to comment.