diff --git a/Makefile b/Makefile index e0def2b..27a4fee 100644 --- a/Makefile +++ b/Makefile @@ -1,23 +1,25 @@ -CC = nvcc -MPCC = nvcc -OPENMP = -CFLAGS = -O3 -NVCCFLAGS = -O3 -arch sm_30 -LIBS = -lm +CC := icc +CXX := icpc +MPCC := nvcc +CFLAGS := -O3 -std=c99 -qopenmp +CXXFLAGS := -O3 -std=c++11 -qopenmp -restrict -DTHREAD_OUTER_LOOP +NVCCFLAGS := -O3 -arch sm_30 +CXXLIBS := -lm +NVCCLIBS := -L/usr/local/cuda/lib64 -all: henry henry_serial +all: henry_cpu # henry_gpu -henry: henry.o - $(CC) -o $@ $(NVCCLIBS) -L/usr/local/cuda/lib64 henry.o +henry_gpu: henry_gpu.o + $(MPCC) $(NVCCFLAGS) $< $(NVCCLIBS) -o $@ -henry.o: henry.cu - $(CC) -c $(NVCCFLAGS) henry.cu +henry_gpu.o: henry_gpu.cu + $(MPCC) $(NVCCFLAGS) -c $< -o $@ -henry_serial: henry_serial.o - g++ -o $@ henry_serial.o -fopenmp +henry_cpu: henry_cpu.o + $(CXX) $(CXXFLAGS) $< $(CXXLIBS) -o $@ -henry_serial.o: henry_serial.cc - g++ -c henry_serial.cc -std=c++11 -fopenmp +henry_cpu.o: henry_cpu.cc + $(CXX) $(CXXFLAGS) -c $< -o $@ -clean: - rm *.o +clean: - rm *.o diff --git a/henry_serial.cc b/henry_cpu.cc similarity index 54% rename from henry_serial.cc rename to henry_cpu.cc index 0907bb5..4b6c0fc 100644 --- a/henry_serial.cc +++ b/henry_cpu.cc @@ -6,20 +6,22 @@ #include #include #include -#include +#include + +#include // data for atom of crystal structure -// Unit cell of crystal structure can then be stored +// Unit cell of crystal structure can then be stored // as pointer array of StructureAtom's struct StructureAtom { // Cartesian position, units: A - double x; - double y; - double z; + double * restrict x; + double * restrict y; + double * restrict z; // Lennard-Jones epsilon parameter with adsorbate - double epsilon; // units: K + double * restrict epsilon; // units: K // Lennard-Jones sigma parameter with adsorbate - double sigma; // units: A + double * restrict sigma; // units: A }; // temperature, Kelvin @@ -32,45 +34,68 @@ const double R = 8.314; // Loop over all atoms of unit cell of crystal structure // Find 
nearest image to methane at point (x, y, z) for application of periodic boundary conditions // Compute energy contribution due to this atom via the Lennard-Jones potential -double ComputeBoltzmannFactorAtPoint(double x, double y, double z, - StructureAtom * structureatoms, - int natoms, - double L) { +double ComputeBoltzmannFactorAtPoint(const double x, const double y, const double z, + const StructureAtom & structureatoms, + const int natoms, const double L) +{ + // Jeff: const-qualified arguments are shared (OpenMP 4.0, 2.14.1.2). + // (x, y, z) : Cartesian coords of methane molecule // structureatoms : pointer array storing info on unit cell of crystal structure // natoms : number of atoms in crystal structure // L : box length double E = 0.0; - + + // Jeff: This would greatly benefit from AoS-to-SoA transformation... + // loop over atoms in crystal structure + #ifdef THREAD_INNER_LOOP + #pragma omp parallel for simd reduction(+:E) default(none) shared(structureatoms) + #else + #pragma omp simd + #endif for (int i = 0; i < natoms; i++) { // Compute distance from (x, y, z) to this atom // compute distances in each coordinate - double dx = x - structureatoms[i].x; - double dy = y - structureatoms[i].y; - double dz = z - structureatoms[i].z; - + double dx = x - structureatoms.x[i]; + double dy = y - structureatoms.y[i]; + double dz = z - structureatoms.z[i]; + // apply nearest image convention for periodic boundary conditions - if (dx > L / 2.0) + const double boxupper = 0.5*L; + const double boxlower = -0.5*L; +#if 0 + if (dx > boxupper) dx = dx - L; - if (dy > L / 2.0) + if (dy > boxupper) dy = dy - L; - if (dz > L / 2.0) + if (dz > boxupper) dz = dz - L; - if (dx <= -L / 2.0) + if (dx <= boxlower) dx = dx + L; - if (dy <= -L / 2.0) - dy = dy + L; - if (dy <= -L / 2.0) + if (dy <= boxlower) dy = dy + L; + if (dz <= boxlower) + dz = dz + L; +#else + dx = (dx > boxupper) ? dx-L : dx; + dx = (dx <= boxlower) ? dx+L : dx; + dy = (dy > boxupper) ? 
dy-L : dy; + dy = (dy <= boxlower) ? dy+L : dy; + dz = (dz > boxupper) ? dz-L : dz; + dz = (dz <= boxlower) ? dz+L : dz; +#endif // distance double r = sqrt(dx*dx + dy*dy + dz*dz); // Compute contribution to energy of adsorbate at (x, y, z) due to this atom // Lennard-Jones potential (not efficient, but for clarity) - E += 4.0 * structureatoms[i].epsilon * (pow(structureatoms[i].sigma / r, 12) - pow(structureatoms[i].sigma / r, 6)); + const double sas = structureatoms.sigma[i] / r; + const double sas6 = pow(sas,6); + const double sas12 = sas6*sas6; + E += 4 * structureatoms.epsilon[i] * (sas12-sas6); } return exp(-E / (R * T)); // return Boltzmann factor } @@ -78,13 +103,16 @@ double ComputeBoltzmannFactorAtPoint(double x, double y, double z, // Inserts a methane molecule at a random position inside the structure // Calls function to compute Boltzmann factor at this point // Stores Boltzmann factor computed at this thread in deviceBoltzmannFactors -int main(int argc, char *argv[]) { +int main(int argc, char *argv[]) +{ // take in number of MC insertions as argument if (argc != 2) { - printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code"); + printf("Run as:\n./henry ninsertions\nwhere ninsertions = Number of MC insertions / (256 * 64) to correspond to CUDA code\n"); exit(EXIT_FAILURE); } - int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions + const int ninsertions = atoi(argv[1]) * 256 * 64; // Number of Monte Carlo insertions + + printf("Running on %d threads\n", omp_get_max_threads()); // // Energetic model for interactions of methane molecule with atoms of framework @@ -96,7 +124,7 @@ int main(int argc, char *argv[]) { epsilons["O"] = 66.884614; epsilons["C"] = 88.480032; epsilons["H"] = 57.276566; - + // Sigma parameters for Lennard-Jones potential (A) std::map sigmas; sigmas["Zn"] = 3.095775; @@ -107,7 +135,6 @@ int main(int argc, char *argv[]) { // // Import unit 
cell of nanoporous material IRMOF-1 // - StructureAtom * structureatoms; // store data in pointer array here // open crystal structure file std::ifstream materialfile("IRMOF-1.cssr"); if (materialfile.fail()) { @@ -121,12 +148,12 @@ int main(int argc, char *argv[]) { std::istringstream istream(line); double L; - istream >> L; + istream >> L; printf("L = %f\n", L); // waste line getline(materialfile, line); - + // get number of atoms getline(materialfile, line); int natoms; // number of atoms @@ -134,12 +161,17 @@ int main(int argc, char *argv[]) { istream.clear(); istream >> natoms; printf("%d atoms\n", natoms); - + // waste line getline(materialfile, line); // Allocate space for material atoms and epsilons/sigmas - structureatoms = (StructureAtom *) malloc(natoms * sizeof(StructureAtom)); + StructureAtom structureatoms; // store data in pointer array here + structureatoms.x = new double[natoms]; + structureatoms.y = new double[natoms]; + structureatoms.z = new double[natoms]; + structureatoms.epsilon = new double[natoms]; + structureatoms.sigma = new double[natoms]; // read atom coordinates for (int i = 0; i < natoms; i++) { @@ -153,27 +185,19 @@ int main(int argc, char *argv[]) { istream >> atomno >> element >> xf >> yf >> zf; // load structureatoms - structureatoms[i].x = L * xf; - structureatoms[i].y = L * yf; - structureatoms[i].z = L * zf; - - structureatoms[i].epsilon = epsilons[element]; - structureatoms[i].sigma = sigmas[element]; - -// printf("%d. 
%s, (%f, %f, %f), eps = %f, sig = %f\n", -// atomno, element.c_str(), -// structureatoms[i].x, structureatoms[i].y, structureatoms[i].z, -// structureatoms[i].epsilon, -// structureatoms[i].sigma); + structureatoms.x[i] = L * xf; + structureatoms.y[i] = L * yf; + structureatoms.z[i] = L * zf; + structureatoms.epsilon[i] = epsilons[element]; + structureatoms.sigma[i] = sigmas[element]; } - + // // Set up random number generator // - unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); - std::default_random_engine generator(seed); // default -// std::mt19937 generator(seed); // Mersenne Twister algo - std::uniform_real_distribution uniform01(0.0, 1.0); // uniformly distributed real no in [0,1] + const unsigned seed = std::chrono::system_clock::now().time_since_epoch().count(); + + double t0 = omp_get_wtime(); // // Compute the Henry coefficient in parallel @@ -181,22 +205,36 @@ int main(int argc, char *argv[]) { // Brackets denote average over space // double KH = 0.0; // will be Henry coefficient - #pragma omp parallel for - for (int i = 0; i < ninsertions; i++) { - // generate random position in structure - double x = L * uniform01(generator); - double y = L * uniform01(generator); - double z = L * uniform01(generator); - // compute Boltzmann factor - KH += ComputeBoltzmannFactorAtPoint(x, y, z, - structureatoms, - natoms, - L); + #ifdef THREAD_OUTER_LOOP + #pragma omp parallel default(none) firstprivate(L,natoms,seed) shared(KH,structureatoms) + #endif + { + std::default_random_engine generator(seed + omp_get_thread_num()); // per-thread seed: identical seeds would make every thread sample the same points + std::uniform_real_distribution uniform01(0.0, 1.0); // uniformly distributed real no in [0,1] + + #ifdef THREAD_OUTER_LOOP + #pragma omp for reduction (+:KH) + #endif + for (int i = 0; i < ninsertions; i++) { + // generate random position in structure + double x = L * uniform01(generator); + double y = L * uniform01(generator); + double z = L * uniform01(generator); + // compute Boltzmann factor + KH += 
ComputeBoltzmannFactorAtPoint(x, y, z, structureatoms, natoms, L); + } } + + double t1 = omp_get_wtime(); + double dt = t1-t0; + // KH = < e^{-E/(kB/T)} > / (RT) KH = KH / (ninsertions * R * T); printf("Henry constant = %e mol/(m3 - Pa)\n", KH); printf("Number of insertions: %d\n", ninsertions); - + printf("Number of insertions per second: %lf\n", ninsertions/dt); + // compare to http://devblogs.nvidia.com/parallelforall/accelerating-materials-discovery-cuda/ + printf("FOM: %lf\n", ninsertions/dt/10000); + return EXIT_SUCCESS; } diff --git a/henry.cu b/henry_gpu.cu similarity index 100% rename from henry.cu rename to henry_gpu.cu diff --git a/monster72.png b/monster72.png new file mode 100644 index 0000000..5b04236 Binary files /dev/null and b/monster72.png differ diff --git a/plot_performance.py b/plot_performance.py index 8efe891..3cf17de 100644 --- a/plot_performance.py +++ b/plot_performance.py @@ -15,9 +15,9 @@ def get_seconds(time_string): return minutes * 60.0 + seconds # GPU -df_CUDA = pd.read_csv("GPU_performance.csv") -df_CUDA['sec'] = df_CUDA['time'].map(get_seconds) -df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64 +#df_CUDA = pd.read_csv("GPU_performance.csv") +#df_CUDA['sec'] = df_CUDA['time'].map(get_seconds) +#df_CUDA['ninsertions'] = df_CUDA['GPUkernelcalls'] * 256 * 64 # OpenMP df_OpenMP = pd.read_csv("OpenMP_performance.csv") @@ -28,15 +28,16 @@ def get_seconds(time_string): fig = plt.figure() plt.xlabel("Monte Carlo insertions (thousands)") -plt.ylabel("Insertions per run time (1000/sec)") +plt.ylabel("Insertions per run time (10000/sec)") -plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 1000.0, marker='s', - color='b', markersize=10, clip_on=False, label='OpenMP (24 OpenMP threads)') -plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 1000.0, marker='o', - color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)') 
+plt.plot(df_OpenMP["ninsertions"] / 1000.0, df_OpenMP["ninsertions"] / df_OpenMP["sec"] / 10000.0, marker='s', + color='b', markersize=10, clip_on=False, label='OpenMP (72 threads)') +#plt.plot(df_CUDA["ninsertions"] / 1000.0, df_CUDA["ninsertions"] / df_CUDA["sec"] / 10000.0, marker='o', +# color='g', markersize=10, clip_on=False, label='CUDA (64 blocks, 256 threads)') plt.xlim([0, 8000]) -plt.ylim(ymin=0) +#plt.ylim(ymin=0) +plt.ylim([0, 1500]) plt.legend(loc='center') diff --git a/run.sh b/run.sh index 32485fa..a3bdcbf 100755 --- a/run.sh +++ b/run.sh @@ -3,31 +3,32 @@ ### ### Benchmark CUDA and OpenMP code performance ### -echo "GPUkernelcalls,time" > GPU_performance.csv +#echo "GPUkernelcalls,time" > GPU_performance.csv echo "EquivalentGPUkernelcalls,time" > OpenMP_performance.csv - -export OMP_NUM_THREADS=24 + +export KMP_AFFINITY=compact +export OMP_NUM_THREADS=72 echo "Running with $OMP_NUM_THREADS OpenMP threads" # Run codes with varying numbers of Monte Carlo insertions, $n -for n in `seq 1 10 500` -do - echo "Running with $n GPU kernel calls" - +for n in `seq 16 16 512` ; do + #echo "Running with $n GPU kernel calls" + ### # GPU code ### - t=$({ time ./henry $n >/dev/null; } |& grep real | awk '{print $2}') - echo -e "\tCUDA run time: $t" - echo "$n,$t" >> GPU_performance.csv # write results to .csv - + #t=$({ time ./henry_gpu $n >/dev/null; } |& grep real | awk '{print $2}') + #echo -e "\tCUDA run time: $t" + #echo "$n,$t" >> GPU_performance.csv # write results to .csv + ### # OpenMP code ### - t=$({ time ./henry_serial $n >/dev/null; } |& grep real | awk '{print $2}') + t=$({ time ./henry_cpu $n >/dev/null; } |& grep real | awk '{print $2}') + time ./henry_cpu $n >/dev/null; echo -e "\tOpenMP run time: $t" echo "$n,$t" >> OpenMP_performance.csv # write results to .csv done -python plot_performance.py +#python plot_performance.py