Skip to content

Commit

Permalink
Further Optimization and Speedup
Browse files Browse the repository at this point in the history
1. Further optimized the Ewald summation initialization; simulation speed is increased by 10-20%
   * Examples:
     * CO2-MFI: 9.7 → 8.3 seconds
     * Bae-Mixture: 108.8 → 97.2 seconds
     * NU-2000: 17.20 → 7.97 seconds
     * NPTMC: 223.73 → 180.81 seconds
     * NVT-Gibbs (100 + 100 cycles): 35.3 → 31.82 seconds
2. Small update to the compilation script: NVC_COMPILE
   * added echo statements that report when each compile/link step has finished
  • Loading branch information
Zhaoli2042 committed Oct 13, 2024
1 parent 670b446 commit 2afac95
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 95 deletions.
12 changes: 6 additions & 6 deletions NVC_COMPILE
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,16 @@ LINKFLAG="-L/opt/nvidia/hpc_sdk/Linux_x86_64/22.5/cuda/lib64 -L/usr/lib64/ -L/op

NVCFLAG="-O3 -std=c++20 -target=gpu -Minline -fopenmp -cuda -stdpar=multicore"

$CXX $NVCFLAG $LINKFLAG -c axpy.cu
$CXX $NVCFLAG $LINKFLAG -c axpy.cu ; echo "Finished axpy.cu"

$CXX $NVCFLAG $LINKFLAG -c main.cpp
$CXX $NVCFLAG $LINKFLAG -c main.cpp ; echo "Finished main.cpp"

$CXX $NVCFLAG $LINKFLAG -c read_data.cpp
$CXX $NVCFLAG $LINKFLAG -c read_data.cpp ; echo "Finished read_data.cpp"

$CXX $NVCFLAG $LINKFLAG -c data_struct.cpp
$CXX $NVCFLAG $LINKFLAG -c data_struct.cpp ; echo "Finished data_struct.cpp"

$CXX $NVCFLAG $LINKFLAG -c VDW_Coulomb.cu
$CXX $NVCFLAG $LINKFLAG -c VDW_Coulomb.cu ; echo "Finished VDW_Coulomb.cu"

$CXX $NVCFLAG $LINKFLAG main.o read_data.o axpy.o data_struct.o VDW_Coulomb.o -o nvc_main.x
$CXX $NVCFLAG $LINKFLAG main.o read_data.o axpy.o data_struct.o VDW_Coulomb.o -o nvc_main.x ; echo "Finished Linking"

rm *.o
103 changes: 14 additions & 89 deletions src_clean/Ewald_Energy_Functions.h
Original file line number Diff line number Diff line change
Expand Up @@ -94,44 +94,12 @@ __device__ void Initialize_Vectors(Boxsize Box, size_t Oldsize, size_t Newsize,
}
}

// Extends the per-axis tables Box.eik_x, Box.eik_y and Box.eik_z by recurrence.
//
// For each axis, row k (k = 2 .. kmax along that axis) is computed element by
// element as multiply(row k-1, row 1). Entry i of row k lives at index
// [i + k * numberOfAtoms]. Row 1 is only read here, never written, so it has to
// hold valid data before this function is called.
//
// The whole table is processed sequentially by the calling thread.
// Oldsize, Newsize and Old are part of the signature but are not used by the body.
__device__ void Initialize_Vectors_SPECIAL(Boxsize Box, size_t Oldsize, size_t Newsize, Atoms Old, size_t numberOfAtoms, int3 kmax)
{
  // Upper bounds are converted to size_t once; this is the same conversion the
  // comparison against a size_t loop counter would perform implicitly.
  const size_t limit_x = static_cast<size_t>(kmax.x);
  const size_t limit_y = static_cast<size_t>(kmax.y);
  const size_t limit_z = static_cast<size_t>(kmax.z);

  // Offset of row 1, the factor shared by every step of the recurrence.
  const size_t first_row = numberOfAtoms;

  // x axis: row[order] = row[order - 1] * row[1]
  for(size_t order = 2; order <= limit_x; ++order)
  {
    const size_t current_row  = order * numberOfAtoms;
    const size_t previous_row = (order - 1) * numberOfAtoms;
    for(size_t atom = 0; atom < numberOfAtoms; ++atom)
    {
      Box.eik_x[current_row + atom] = multiply(Box.eik_x[previous_row + atom], Box.eik_x[first_row + atom]);
    }
  }

  // y axis: same recurrence on Box.eik_y
  for(size_t order = 2; order <= limit_y; ++order)
  {
    const size_t current_row  = order * numberOfAtoms;
    const size_t previous_row = (order - 1) * numberOfAtoms;
    for(size_t atom = 0; atom < numberOfAtoms; ++atom)
    {
      Box.eik_y[current_row + atom] = multiply(Box.eik_y[previous_row + atom], Box.eik_y[first_row + atom]);
    }
  }

  // z axis: same recurrence on Box.eik_z
  for(size_t order = 2; order <= limit_z; ++order)
  {
    const size_t current_row  = order * numberOfAtoms;
    const size_t previous_row = (order - 1) * numberOfAtoms;
    for(size_t atom = 0; atom < numberOfAtoms; ++atom)
    {
      Box.eik_z[current_row + atom] = multiply(Box.eik_z[previous_row + atom], Box.eik_z[first_row + atom]);
    }
  }
}

__device__ void Initialize_Vectors_thread(Complex* eik, size_t numberOfAtoms, int k_max)
__device__ void Initialize_Vectors_thread(Complex* eik, size_t numberOfAtoms, int k_max, size_t i)
{
// Calculate remaining positive kx, ky and kz by recurrence
for(size_t k = 2; k <= k_max; ++k)
{
for(size_t i = 0; i != numberOfAtoms; ++i)
{
eik[i + k * numberOfAtoms] = multiply(eik[i + (k - 1) * numberOfAtoms], eik[i + 1 * numberOfAtoms]);
}
eik[i + k * numberOfAtoms] = multiply(eik[i + (k - 1) * numberOfAtoms], eik[i + 1 * numberOfAtoms]);
}
}

Expand Down Expand Up @@ -185,20 +153,9 @@ __global__ void Initialize_WaveVector_General(Boxsize Box, int3 kmax, Atoms* d_a
tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[ij + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[ij + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[ij + 1 * numberOfAtoms] = tempcomplex;
}
__syncthreads();

if(ij == 0)
{
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x);
}
else if(ij == 1)
{
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y);
}
else if(ij == 2)
{
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z);
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x, ij);
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y, ij);
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z, ij);
}
}

Expand Down Expand Up @@ -239,19 +196,9 @@ __global__ void Initialize_WaveVector_Reinsertion(Boxsize Box, int3 kmax, double
tempcomplex.real = std::cos(s.x); tempcomplex.imag = std::sin(s.x); Box.eik_x[ij + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[ij + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[ij + 1 * numberOfAtoms] = tempcomplex;
}
__syncthreads();
if(ij == 0)
{
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x);
}
else if(ij == 1)
{
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y);
}
else if(ij == 2)
{
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z);
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x, ij);
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y, ij);
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z, ij);
}
}

Expand Down Expand Up @@ -291,20 +238,9 @@ __global__ void Initialize_WaveVector_IdentitySwap(Boxsize Box, int3 kmax, doubl
tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); Box.eik_y[ij + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); Box.eik_z[ij + 1 * numberOfAtoms] = tempcomplex;

}
__syncthreads();

if(ij == 0)
{
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x);
}
else if(ij == 1)
{
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y);
}
else if(ij == 2)
{
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z);
Initialize_Vectors_thread(Box.eik_x, numberOfAtoms, kmax.x, ij);
Initialize_Vectors_thread(Box.eik_y, numberOfAtoms, kmax.y, ij);
Initialize_Vectors_thread(Box.eik_z, numberOfAtoms, kmax.z, ij);
}
}

Expand Down Expand Up @@ -890,20 +826,9 @@ __global__ void Setup_Wave_Vector_Ewald(Boxsize Box, Complex* eik_x, Complex* ei
tempcomplex.real = std::cos(s.y); tempcomplex.imag = std::sin(s.y); eik_y[i + 1 * numberOfAtoms] = tempcomplex;
tempcomplex.real = std::cos(s.z); tempcomplex.imag = std::sin(s.z); eik_z[i + 1 * numberOfAtoms] = tempcomplex;
// Calculate remaining positive kx, ky and kz by recurrence
for(size_t kx = 2; kx <= Box.kmax.x; ++kx)
{
eik_x[i + kx * numberOfAtoms] = multiply(eik_x[i + (kx - 1) * numberOfAtoms], eik_x[i + 1 * numberOfAtoms]);
}

for(size_t ky = 2; ky <= Box.kmax.y; ++ky)
{
eik_y[i + ky * numberOfAtoms] = multiply(eik_y[i + (ky - 1) * numberOfAtoms], eik_y[i + 1 * numberOfAtoms]);
}

for(size_t kz = 2; kz <= Box.kmax.z; ++kz)
{
eik_z[i + kz * numberOfAtoms] = multiply(eik_z[i + (kz - 1) * numberOfAtoms], eik_z[i + 1 * numberOfAtoms]);
}
Initialize_Vectors_thread(eik_x, numberOfAtoms, Box.kmax.x, i);
Initialize_Vectors_thread(eik_y, numberOfAtoms, Box.kmax.y, i);
Initialize_Vectors_thread(eik_z, numberOfAtoms, Box.kmax.z, i);
}
}

Expand Down

0 comments on commit 2afac95

Please sign in to comment.