AoS-to-SoA conversion #4

Open — wants to merge 20 commits into master
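For readers skimming the diff: a minimal sketch of the before/after data layouts this PR implements. The struct names `StructureAtomAoS` and `StructureAtomSoA` are illustrative only; the patch itself keeps the single name `StructureAtom` and changes its members from scalars to per-field arrays.

```cpp
// Sketch only, not part of the patch.
// Before (AoS): one record per atom; the fields of a single atom are contiguous,
// but the x coordinates of consecutive atoms are strided by sizeof(StructureAtomAoS).
struct StructureAtomAoS {
    double x, y, z;   // Cartesian position, units: A
    double epsilon;   // Lennard-Jones epsilon with adsorbate, units: K
    double sigma;     // Lennard-Jones sigma with adsorbate, units: A
};

// After (SoA): one array per field; the inner loop over atoms then reads
// unit-stride streams (x[i], y[i], ...), which is what allows the compiler
// to vectorize the Lennard-Jones loop effectively.
struct StructureAtomSoA {
    double *x, *y, *z;
    double *epsilon;
    double *sigma;
};
```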
36 changes: 19 additions & 17 deletions Makefile
@@ -1,23 +1,25 @@
CC = nvcc
MPCC = nvcc
OPENMP =
CFLAGS = -O3
NVCCFLAGS = -O3 -arch sm_30
LIBS = -lm
CC := icc
CXX := icpc
MPCC := nvcc
CFLAGS := -O3 -std=c99 -qopenmp
CXXFLAGS := -O3 -std=c++11 -qopenmp -restrict -DTHREAD_OUTER_LOOP
NVCCFLAGS := -O3 -arch sm_30
CXXLIBS := -lm
NVCCLIBS := -L/usr/local/cuda/lib64

all: henry henry_serial
all: henry_cpu # henry_gpu

henry: henry.o
$(CC) -o $@ $(NVCCLIBS) -L/usr/local/cuda/lib64 henry.o
henry_gpu: henry_gpu.o
$(CC) $(NVCCFLAGS) $< $(NVCCLIBS) -o $@

henry.o: henry.cu
$(CC) -c $(NVCCFLAGS) henry.cu
henry_gpu.o: henry_gpu.cu
$(CC) $(NVCCFLAGS) -c $< -o $@

henry_serial: henry_serial.o
g++ -o $@ henry_serial.o -fopenmp
henry_cpu: henry_cpu.o
$(CXX) $(CXXFLAGS) $< -o $@

henry_serial.o: henry_serial.cc
g++ -c henry_serial.cc -std=c++11 -fopenmp
henry_cpu.o: henry_cpu.cc
$(CXX) $(CXXFLAGS) -c $< -o $@

clean:
rm *.o
clean:
rm *.o
162 changes: 100 additions & 62 deletions henry_serial.cc → henry_cpu.cc
@@ -6,20 +6,22 @@
#include <cstdlib>
#include <sstream>
#include <map>
#include<random>
#include <random>

#include <omp.h>

// data for atom of crystal structure
// Unit cell of crystal structure can then be stored
// Unit cell of crystal structure can then be stored
// as pointer array of StructureAtom's
struct StructureAtom {
// Cartesian position, units: A
double x;
double y;
double z;
double * restrict x;
double * restrict y;
double * restrict z;
// Lennard-Jones epsilon parameter with adsorbate
double epsilon; // units: K
double * restrict epsilon; // units: K
// Lennard-Jones sigma parameter with adsorbate
double sigma; // units: A
double * restrict sigma; // units: A
};

// temperature, Kelvin
@@ -32,59 +34,85 @@ const double R = 8.314;
// Loop over all atoms of unit cell of crystal structure
// Find nearest image to methane at point (x, y, z) for application of periodic boundary conditions
// Compute energy contribution due to this atom via the Lennard-Jones potential
double ComputeBoltzmannFactorAtPoint(double x, double y, double z,
StructureAtom * structureatoms,
int natoms,
double L) {
double ComputeBoltzmannFactorAtPoint(const double x, const double y, const double z,
const StructureAtom & structureatoms,
const int natoms, const double L)
{
// Jeff: const-qualified arguments are shared (OpenMP 4.0, 2.14.1.2).

// (x, y, z) : Cartesian coords of methane molecule
// structureatoms : struct of arrays storing info on unit cell of crystal structure
// natoms : number of atoms in crystal structure
// L : box length
double E = 0.0;


// Jeff: This would greatly benefit from AoS-to-SoA transformation...

// loop over atoms in crystal structure
#ifdef THREAD_INNER_LOOP
#pragma omp parallel for simd reduction(+:E) default(none) shared(structureatoms)
#else
#pragma omp simd
#endif
for (int i = 0; i < natoms; i++) {
// Compute distance from (x, y, z) to this atom

// compute distances in each coordinate
double dx = x - structureatoms[i].x;
double dy = y - structureatoms[i].y;
double dz = z - structureatoms[i].z;
double dx = x - structureatoms.x[i];
double dy = y - structureatoms.y[i];
double dz = z - structureatoms.z[i];

// apply nearest image convention for periodic boundary conditions
if (dx > L / 2.0)
const double boxupper = 0.5*L;
const double boxlower = -0.5*L;
#if 0
if (dx > boxupper)
dx = dx - L;
if (dy > L / 2.0)
if (dy > boxupper)
dy = dy - L;
if (dz > L / 2.0)
if (dz > boxupper)
dz = dz - L;
if (dx <= -L / 2.0)
if (dx <= boxlower)
dx = dx + L;
if (dy <= -L / 2.0)
dy = dy + L;
if (dy <= -L / 2.0)
if (dy <= boxlower)
dy = dy + L;
if (dz <= boxlower)
dz = dz + L;
#else
dx = (dx > boxupper) ? dx-L : dx;
dx = (dx <= boxlower) ? dx+L : dx;
dy = (dy > boxupper) ? dy-L : dy;
dy = (dy <= boxlower) ? dy+L : dy;
dz = (dz > boxupper) ? dz-L : dz;
dz = (dz <= boxlower) ? dz+L : dz;
#endif

// distance
double r = sqrt(dx*dx + dy*dy + dz*dz);

// Compute contribution to energy of adsorbate at (x, y, z) due to this atom
// Lennard-Jones potential (not efficient, but for clarity)
E += 4.0 * structureatoms[i].epsilon * (pow(structureatoms[i].sigma / r, 12) - pow(structureatoms[i].sigma / r, 6));
const double sas = structureatoms.sigma[i] / r;
const double sas6 = pow(sas,6);
const double sas12 = sas6*sas6;
E += 4 * structureatoms.epsilon[i] * (sas12-sas6);
}
return exp(-E / (R * T)); // return Boltzmann factor
}

// Inserts a methane molecule at a random position inside the structure
// Calls function to compute Boltzmann factor at this point
// Stores Boltzmann factor computed at this thread in deviceBoltzmannFactors
int main(int argc, char *argv[]) {
int main(int argc, char *argv[])
{
// take in number of MC insertions as argument
if (argc != 2) {
printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code");
printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code\n");
exit(EXIT_FAILURE);
}
int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions
const int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions

printf("Running on %d threads\n", omp_get_max_threads());

//
// Energetic model for interactions of methane molecule with atoms of framework
@@ -96,7 +124,7 @@ int main(int argc, char *argv[]) {
epsilons["O"] = 66.884614;
epsilons["C"] = 88.480032;
epsilons["H"] = 57.276566;

// Sigma parameters for Lennard-Jones potential (A)
std::map<std::string, double> sigmas;
sigmas["Zn"] = 3.095775;
@@ -107,7 +135,6 @@ int main(int argc, char *argv[]) {
//
// Import unit cell of nanoporous material IRMOF-1
//
StructureAtom * structureatoms; // store data in pointer array here
// open crystal structure file
std::ifstream materialfile("IRMOF-1.cssr");
if (materialfile.fail()) {
@@ -121,25 +148,30 @@ int main(int argc, char *argv[]) {
std::istringstream istream(line);

double L;
istream >> L;
istream >> L;
printf("L = %f\n", L);

// waste line
getline(materialfile, line);

// get number of atoms
getline(materialfile, line);
int natoms; // number of atoms
istream.str(line);
istream.clear();
istream >> natoms;
printf("%d atoms\n", natoms);

// waste line
getline(materialfile, line);

// Allocate space for material atoms and epsilons/sigmas
structureatoms = (StructureAtom *) malloc(natoms * sizeof(StructureAtom));
StructureAtom structureatoms; // SoA: one heap-allocated array per field
structureatoms.x = new double[natoms];
structureatoms.y = new double[natoms];
structureatoms.z = new double[natoms];
structureatoms.epsilon = new double[natoms];
structureatoms.sigma = new double[natoms];

// read atom coordinates
for (int i = 0; i < natoms; i++) {
@@ -153,50 +185,56 @@ int main(int argc, char *argv[]) {

istream >> atomno >> element >> xf >> yf >> zf;
// load structureatoms
structureatoms[i].x = L * xf;
structureatoms[i].y = L * yf;
structureatoms[i].z = L * zf;

structureatoms[i].epsilon = epsilons[element];
structureatoms[i].sigma = sigmas[element];

// printf("%d. %s, (%f, %f, %f), eps = %f, sig = %f\n",
// atomno, element.c_str(),
// structureatoms[i].x, structureatoms[i].y, structureatoms[i].z,
// structureatoms[i].epsilon,
// structureatoms[i].sigma);
structureatoms.x[i] = L * xf;
structureatoms.y[i] = L * yf;
structureatoms.z[i] = L * zf;
structureatoms.epsilon[i] = epsilons[element];
structureatoms.sigma[i] = sigmas[element];
}

//
// Set up random number generator
//
unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
std::default_random_engine generator(seed); // default
// std::mt19937 generator(seed); // Mersenne Twister algo
std::uniform_real_distribution<double> uniform01(0.0, 1.0); // uniformly distributed real no in [0,1]
const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();

double t0 = omp_get_wtime();

//
// Compute the Henry coefficient in parallel
// KH = < e^{-E/(kB * T)} > / (R * T)
// Brackets denote average over space
//
double KH = 0.0; // will be Henry coefficient
#pragma omp parallel for
for (int i = 0; i < ninsertions; i++) {
// generate random position in structure
double x = L * uniform01(generator);
double y = L * uniform01(generator);
double z = L * uniform01(generator);
// compute Boltzmann factor
KH += ComputeBoltzmannFactorAtPoint(x, y, z,
structureatoms,
natoms,
L);
#ifdef THREAD_OUTER_LOOP
#pragma omp parallel default(none) firstprivate(L,natoms,seed) shared(KH,structureatoms)
#endif
{
std::default_random_engine generator(seed); // default
std::uniform_real_distribution<double> uniform01(0.0, 1.0); // uniformly distributed real no in [0,1]

#ifdef THREAD_OUTER_LOOP
#pragma omp for reduction (+:KH)
#endif
for (int i = 0; i < ninsertions; i++) {
// generate random position in structure
double x = L * uniform01(generator);
double y = L * uniform01(generator);
double z = L * uniform01(generator);
// compute Boltzmann factor
KH += ComputeBoltzmannFactorAtPoint(x, y, z, structureatoms, natoms, L);
}
}

double t1 = omp_get_wtime();
double dt = t1-t0;

// KH = < e^{-E/(kB * T)} > / (R * T)
KH = KH / (ninsertions * R * T);
printf("Henry constant = %e mol/(m3 - Pa)\n", KH);
printf("Number of insertions: %d\n", ninsertions);

printf("Number of insertions per second: %lf\n", ninsertions/dt);
// compare to http://devblogs.nvidia.com/parallelforall/accelerating-materials-discovery-cuda/
printf("FOM: %lf\n", ninsertions/dt/10000);

return EXIT_SUCCESS;
}
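For reference, the quantity the rewritten insertion loop estimates, written out from the comments in the diff ($R$ is the gas constant, $T$ the temperature, $E_i$ the Lennard-Jones energy of the $i$-th random insertion):

$$
E_i = \sum_{j=1}^{N_{\mathrm{atoms}}} 4\,\epsilon_j \left[ \left(\frac{\sigma_j}{r_{ij}}\right)^{12} - \left(\frac{\sigma_j}{r_{ij}}\right)^{6} \right],
\qquad
K_H \approx \frac{1}{N_{\mathrm{ins}}\, R\, T} \sum_{i=1}^{N_{\mathrm{ins}}} e^{-E_i/(R T)}
$$

where $r_{ij}$ is the nearest-image distance between insertion point $i$ and framework atom $j$. This matches `ComputeBoltzmannFactorAtPoint` returning $e^{-E/(RT)}$, the accumulation into `KH`, and the final normalization `KH = KH / (ninsertions * R * T)`.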
File renamed without changes.
Binary file added monster72.png
19 changes: 10 additions & 9 deletions plot_performance.py
@@ -15,9 +15,9 @@ def get_seconds(time_string):
return minutes * 60.0 + seconds

# GPU
df_CUDA = pd.read_csv("GPU_performance.csv")
df_CUDA['sec'] = df_CUDA['time'].map(get_seconds)
df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64
#df_CUDA = pd.read_csv("GPU_performance.csv")
#df_CUDA['sec'] = df_CUDA['time'].map(get_seconds)
#df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64

# OpenMP
df_OpenMP = pd.read_csv("OpenMP_performance.csv")
@@ -28,15 +28,16 @@ def get_seconds(time_string):
fig = plt.figure()

plt.xlabel("Monte Carlo insertions (thousands)")
plt.ylabel("Insertions per run time (1000/sec)")
plt.ylabel("Insertions per run time (10000/sec)")

plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 1000.0, marker='s',
color='b', markersize=10, clip_on=False, label='OpenMP (24 OpenMP threads)')
plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 1000.0, marker='o',
color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)')
plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 10000.0, marker='s',
color='b', markersize=10, clip_on=False, label='OpenMP (72 threads)')
#plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 10000.0, marker='o',
# color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)')

plt.xlim([0, 8000])
plt.ylim(ymin=0)
#plt.ylim(ymin=0)
plt.ylim([0, 1500])

plt.legend(loc='center')

27 changes: 14 additions & 13 deletions run.sh
@@ -3,31 +3,32 @@
###
### Benchmark CUDA and OpenMP code performance
###
echo "GPUkernelcalls,time" > GPU_performance.csv
#echo "GPUkernelcalls,time" > GPU_performance.csv
echo "EquivalentGPUkernelcalls,time" > OpenMP_performance.csv

export OMP_NUM_THREADS=24

export KMP_AFFINITY=compact
export OMP_NUM_THREADS=72
echo "Running with $OMP_NUM_THREADS OpenMP threads"

# Run codes with varying numbers of Monte Carlo insertions, $n
for n in `seq 1 10 500`
do
echo "Running with $n GPU kernel calls"

for n in `seq 16 16 512` ; do
#echo "Running with $n GPU kernel calls"

###
# GPU code
###
t=$({ time ./henry $n >/dev/null; } |& grep real | awk '{print $2}')
echo -e "\tCUDA run time: $t"
echo "$n,$t" >> GPU_performance.csv # write results to .csv
#t=$({ time ./henry_gpu $n >/dev/null; } |& grep real | awk '{print $2}')
#echo -e "\tCUDA run time: $t"
#echo "$n,$t" >> GPU_performance.csv # write results to .csv

###
# OpenMP code
###
t=$({ time ./henry_serial $n >/dev/null; } |& grep real | awk '{print $2}')
t=$({ time ./henry_cpu $n >/dev/null; } |& grep real | awk '{print $2}')
time ./henry_cpu $n >/dev/null;
echo -e "\tOpenMP run time: $t"
echo "$n,$t" >> OpenMP_performance.csv # write results to .csv

done

python plot_performance.py
#python plot_performance.py