diff --git a/Makefile b/Makefile index e0def2b..27a4fee 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,25 @@ -CC = nvcc -MPCC = nvcc -OPENMP = -CFLAGS = -O3 -NVCCFLAGS = -O3 -arch sm_30 -LIBS = -lm +CC := icc +CXX := icpc +MPCC := nvcc +CFLAGS := -O3 -std=c99 -qopenmp +CXXFLAGS := -O3 -std=c++11 -qopenmp -restrict -DTHREAD_OUTER_LOOP +NVCCFLAGS := -O3 -arch sm_30 +CXXLIBS := -lm +NVCCLIBS := -L/usr/local/cuda/lib64 -all: henry henry_serial +all: henry_cpu # henry_gpu -henry: henry.o - $(CC) -o $@ $(NVCCLIBS) -L/usr/local/cuda/lib64 henry.o +henry_gpu: henry_gpu.o + $(MPCC) $(NVCCFLAGS) $< $(NVCCLIBS) -o $@ -henry.o: henry.cu - $(CC) -c $(NVCCFLAGS) henry.cu +henry_gpu.o: henry_gpu.cu + $(MPCC) $(NVCCFLAGS) -c $< -o $@ -henry_serial: henry_serial.o - g++ -o $@ henry_serial.o -fopenmp +henry_cpu: henry_cpu.o + $(CXX) $(CXXFLAGS) $< $(CXXLIBS) -o $@ -henry_serial.o: henry_serial.cc - g++ -c henry_serial.cc -std=c++11 -fopenmp +henry_cpu.o: henry_cpu.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ -clean: - rm *.o +clean: - rm *.o diff --git a/henry_serial.cc b/henry_cpu.cc similarity index 54% rename from henry_serial.cc rename to henry_cpu.cc index 0907bb5..4b6c0fc 100644 --- a/henry_serial.cc +++ b/henry_cpu.cc @@ -6,20 +6,22 @@ #include #include #include -#include +#include + +#include // data for atom of crystal structure -// Unit cell of crystal structure can then be stored +// Unit cell of crystal structure can then be stored // as pointer array of StructureAtom's struct StructureAtom { // Cartesian position, units: A - double x; - double y; - double z; + double * restrict x; + double * restrict y; + double * restrict z; // Lennard-Jones epsilon parameter with adsorbate - double epsilon; // units: K + double * restrict epsilon; // units: K // Lennard-Jones sigma parameter with adsorbate - double sigma; // units: A + double * restrict sigma; // units: A }; // temperature, Kelvin @@ -32,45 +34,68 @@ const double R = 8.314; // Loop over all atoms of unit cell of crystal structure // Find 
nearest image to methane at point (x, y, z) for application of periodic boundary conditions // Compute energy contribution due to this atom via the Lennard-Jones potential -double ComputeBoltzmannFactorAtPoint(double x, double y, double z, - StructureAtom * structureatoms, - int natoms, - double L) { +double ComputeBoltzmannFactorAtPoint(const double x, const double y, const double z, + const StructureAtom & structureatoms, + const int natoms, const double L) +{ + // Jeff: const-qualified arguments are shared (OpenMP 4.0, 2.14.1.2). + // (x, y, z) : Cartesian coords of methane molecule // structureatoms : pointer array storing info on unit cell of crystal structure // natoms : number of atoms in crystal structure // L : box length double E = 0.0; - + + // Jeff: This would greatly benefit from AoS-to-SoA transformation... + // loop over atoms in crystal structure + #ifdef THREAD_INNER_LOOP + #pragma omp parallel for simd reduction(+:E) default(none) shared(structureatoms) + #else + #pragma omp simd + #endif for (int i = 0; i < natoms; i++) { // Compute distance from (x, y, z) to this atom // compute distances in each coordinate - double dx = x - structureatoms[i].x; - double dy = y - structureatoms[i].y; - double dz = z - structureatoms[i].z; - + double dx = x - structureatoms.x[i]; + double dy = y - structureatoms.y[i]; + double dz = z - structureatoms.z[i]; + // apply nearest image convention for periodic boundary conditions - if (dx > L / 2.0) + const double boxupper = 0.5*L; + const double boxlower = -0.5*L; +#if 0 + if (dx > boxupper) dx = dx - L; - if (dy > L / 2.0) + if (dy > boxupper) dy = dy - L; - if (dz > L / 2.0) + if (dz > boxupper) dz = dz - L; - if (dx <= -L / 2.0) + if (dx <= boxlower) dx = dx + L; - if (dy <= -L / 2.0) - dy = dy + L; - if (dy <= -L / 2.0) + if (dy <= boxlower) dy = dy + L; + if (dz <= boxlower) + dz = dz + L; +#else + dx = (dx > boxupper) ? dx-L : dx; + dx = (dx <= boxlower) ? dx+L : dx; + dy = (dy > boxupper) ? 
dy-L : dy; + dy = (dy <= boxlower) ? dy+L : dy; + dz = (dz > boxupper) ? dz-L : dz; + dz = (dz <= boxlower) ? dz+L : dz; +#endif // distance double r = sqrt(dx*dx + dy*dy + dz*dz); // Compute contribution to energy of adsorbate at (x, y, z) due to this atom // Lennard-Jones potential (not efficient, but for clarity) - E += 4.0 * structureatoms[i].epsilon * (pow(structureatoms[i].sigma / r, 12) - pow(structureatoms[i].sigma / r, 6)); + const double sas = structureatoms.sigma[i] / r; + const double sas6 = pow(sas,6); + const double sas12 = sas6*sas6; + E += 4 * structureatoms.epsilon[i] * (sas12-sas6); } return exp(-E / (R * T)); // return Boltzmann factor } @@ -78,13 +103,16 @@ double ComputeBoltzmannFactorAtPoint(double x, double y, double z, // Inserts a methane molecule at a random position inside the structure // Calls function to compute Boltzmann factor at this point // Stores Boltzmann factor computed at this thread in deviceBoltzmannFactors -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) +{ // take in number of MC insertions as argument if (argc != 2) { - printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code"); + printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code\n"); exit(EXIT_FAILURE); } - int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions + const int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions + + printf("Running on %d threads\n", omp_get_max_threads()); // // Energetic model for interactions of methane molecule with atoms of framework @@ -96,7 +124,7 @@ int main(int argc, char *argv[]) { epsilons["O"] = 66.884614; epsilons["C"] = 88.480032; epsilons["H"] = 57.276566; - + // Sigma parameters for Lennard-Jones potential (A) std::map sigmas; sigmas["Zn"] = 3.095775; @@ -107,7 +135,6 @@ int main(int argc, char *argv[]) { // // Import unit 
cell of nanoporous material IRMOF-1 // - StructureAtom * structureatoms; // store data in pointer array here // open crystal structure file std::ifstream materialfile("IRMOF-1.cssr"); if (materialfile.fail()) { @@ -121,12 +148,12 @@ int main(int argc, char *argv[]) { std::istringstream istream(line); double L; - istream >> L; + istream >> L; printf("L = %f\n", L); // waste line getline(materialfile, line); - + // get number of atoms getline(materialfile, line); int natoms; // number of atoms @@ -134,12 +161,17 @@ int main(int argc, char *argv[]) { istream.clear(); istream >> natoms; printf("%d atoms\n", natoms); - + // waste line getline(materialfile, line); // Allocate space for material atoms and epsilons/sigmas - structureatoms = (StructureAtom *) malloc(natoms * sizeof(StructureAtom)); + StructureAtom structureatoms; // store data in pointer array here + structureatoms.x = new double[natoms]; + structureatoms.y = new double[natoms]; + structureatoms.z = new double[natoms]; + structureatoms.epsilon = new double[natoms]; + structureatoms.sigma = new double[natoms]; // read atom coordinates for (int i = 0; i < natoms; i++) { @@ -153,27 +185,19 @@ int main(int argc, char *argv[]) { istream >> atomno >> element >> xf >> yf >> zf; // load structureatoms - structureatoms[i].x = L * xf; - structureatoms[i].y = L * yf; - structureatoms[i].z = L * zf; - - structureatoms[i].epsilon = epsilons[element]; - structureatoms[i].sigma = sigmas[element]; - -// printf("%d. 
%s, (%f, %f, %f), eps = %f, sig = %f\n", -// atomno, element.c_str(), -// structureatoms[i].x, structureatoms[i].y, structureatoms[i].z, -// structureatoms[i].epsilon, -// structureatoms[i].sigma); + structureatoms.x[i] = L * xf; + structureatoms.y[i] = L * yf; + structureatoms.z[i] = L * zf; + structureatoms.epsilon[i] = epsilons[element]; + structureatoms.sigma[i] = sigmas[element]; } - + // // Set up random number generator // - unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::default_random_engine generator(seed); // default -// std::mt19937 generator(seed); // Mersenne Twister algo - std::uniform_real_distribution uniform01(0.0, 1.0); // uniformly distributed real no in [0,1] + const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + + double t0 = omp_get_wtime(); // // Compute the Henry coefficient in parallel @@ -181,22 +205,36 @@ int main(int argc, char *argv[]) { // Brackets denote average over space // double KH = 0.0; // will be Henry coefficient - #pragma omp parallel for - for (int i = 0; i < ninsertions; i++) { - // generate random position in structure - double x = L * uniform01(generator); - double y = L * uniform01(generator); - double z = L * uniform01(generator); - // compute Boltzmann factor - KH += ComputeBoltzmannFactorAtPoint(x, y, z, - structureatoms, - natoms, - L); + #ifdef THREAD_OUTER_LOOP + #pragma omp parallel default(none) firstprivate(L,natoms,seed) shared(KH,structureatoms) + #endif + { + std::default_random_engine generator(seed + omp_get_thread_num()); // per-thread seed: identical seeds would make every thread sample the same points + std::uniform_real_distribution uniform01(0.0, 1.0); // uniformly distributed real no in [0,1] + + #ifdef THREAD_OUTER_LOOP + #pragma omp for reduction (+:KH) + #endif + for (int i = 0; i < ninsertions; i++) { + // generate random position in structure + double x = L * uniform01(generator); + double y = L * uniform01(generator); + double z = L * uniform01(generator); + // compute Boltzmann factor + KH += 
ComputeBoltzmannFactorAtPoint(x, y, z, structureatoms, natoms, L); + } } + + double t1 = omp_get_wtime(); + double dt = t1-t0; + // KH = < e^{-E/(kB/T)} > / (RT) KH = KH / (ninsertions * R * T); printf("Henry constant = %e mol/(m3 - Pa)\n", KH); printf("Number of insertions: %d\n", ninsertions); - + printf("Number of insertions per second: %lf\n", ninsertions/dt); + // compare to http://devblogs.nvidia.com/parallelforall/accelerating-materials-discovery-cuda/ + printf("FOM: %lf\n", ninsertions/dt/10000); + return EXIT_SUCCESS; } diff --git a/henry.cu b/henry_gpu.cu similarity index 100% rename from henry.cu rename to henry_gpu.cu diff --git a/monster72.png b/monster72.png new file mode 100644 index 0000000..5b04236 Binary files /dev/null and b/monster72.png differ diff --git a/plot_performance.py b/plot_performance.py index 8efe891..3cf17de 100644 --- a/plot_performance.py +++ b/plot_performance.py @@ -15,9 +15,9 @@ def get_seconds(time_string): return minutes * 60.0 + seconds # GPU -df_CUDA = pd.read_csv("GPU_performance.csv") -df_CUDA['sec'] = df_CUDA['time'].map(get_seconds) -df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64 +#df_CUDA = pd.read_csv("GPU_performance.csv") +#df_CUDA['sec'] = df_CUDA['time'].map(get_seconds) +#df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64 # OpenMP df_OpenMP = pd.read_csv("OpenMP_performance.csv") @@ -28,15 +28,16 @@ def get_seconds(time_string): fig = plt.figure() plt.xlabel("Monte Carlo insertions (thousands)") -plt.ylabel("Insertions per run time (1000/sec)") +plt.ylabel("Insertions per run time (10000/sec)") -plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 1000.0, marker='s', - color='b', markersize=10, clip_on=False, label='OpenMP (24 OpenMP threads)') -plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 1000.0, marker='o', - color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)') 
+plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 10000.0, marker='s', + color='b', markersize=10, clip_on=False, label='OpenMP (72 threads)') +#plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 10000.0, marker='o', +# color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)') plt.xlim([0, 8000]) -plt.ylim(ymin=0) +#plt.ylim(ymin=0) +plt.ylim([0, 1500]) plt.legend(loc='center') diff --git a/run.sh b/run.sh index 32485fa..a3bdcbf 100755 --- a/run.sh +++ b/run.sh @@ -3,31 +3,32 @@ ### ### Benchmark CUDA and OpenMP code performance ### -echo "GPUkernelcalls,time" > GPU_performance.csv +#echo "GPUkernelcalls,time" > GPU_performance.csv echo "EquivalentGPUkernelcalls,time" > OpenMP_performance.csv - -export OMP_NUM_THREADS=24 + +export KMP_AFFINITY=compact +export OMP_NUM_THREADS=72 echo "Running with $OMP_NUM_THREADS OpenMP threads" # Run codes with varying numbers of Monte Carlo insertions, $n -for n in `seq 1 10 500` -do - echo "Running with $n GPU kernel calls" - +for n in `seq 16 16 512` ; do + #echo "Running with $n GPU kernel calls" + ### # GPU code ### - t=$({ time ./henry $n >/dev/null; } |& grep real | awk '{print $2}') - echo -e "\tCUDA run time: $t" - echo "$n,$t" >> GPU_performance.csv # write results to .csv - + #t=$({ time ./henry_gpu $n >/dev/null; } |& grep real | awk '{print $2}') + #echo -e "\tCUDA run time: $t" + #echo "$n,$t" >> GPU_performance.csv # write results to .csv + ### # OpenMP code ### - t=$({ time ./henry_serial $n >/dev/null; } |& grep real | awk '{print $2}') + t=$({ time ./henry_cpu $n >/dev/null; } |& grep real | awk '{print $2}') + time ./henry_cpu $n >/dev/null; echo -e "\tOpenMP run time: $t" echo "$n,$t" >> OpenMP_performance.csv # write results to .csv done -python plot_performance.py +#python plot_performance.py