From 093234afda556d8facdfd7d18e4b9207299dace3 Mon Sep 17 00:00:00 2001 From: OSUmageed Date: Tue, 2 May 2017 23:43:52 -0700 Subject: [PATCH] New version is ready. Moved the GPUchoice to a preprocessor directive (duh). Euler is commented well. Serves as the documentation for the entire package. --- README.md | 2 + SweptSource/Euler1D_SweptShared.cu | 147 +++++++++++++++++++---------- SweptSource/Heat1D_SweptShared.cu | 15 ++- SweptSource/KS1D_SweptShared.cu | 6 +- 4 files changed, 115 insertions(+), 55 deletions(-) diff --git a/README.md b/README.md index bc68c28..68e40bf 100644 --- a/README.md +++ b/README.md @@ -18,6 +18,8 @@ Navigate to the folder, 'make' the executable and run the deviceQuery program. __OR__ if MATLAB is available, you can open matlab and type gpuDevice in the command line. You can also see the number of GPUs in your environment with gpuDeviceCount and feed the index to gpuDevice to query individual GPUs. + If using a GPU numbered other than 0: change the GPUNUM preprocessor variable in each source code file. + 3. Open the Makefile in the SweptSource folder and change the compute_ and sm_ numbers in CUDAFLAGS to your compute capability. 4. Make sure the nvcc compiler is on your shell path. On Linux it should be in: /usr/local/cuda/bin diff --git a/SweptSource/Euler1D_SweptShared.cu b/SweptSource/Euler1D_SweptShared.cu index 9896488..f036386 100644 --- a/SweptSource/Euler1D_SweptShared.cu +++ b/SweptSource/Euler1D_SweptShared.cu @@ -11,7 +11,7 @@ The problem may be evaluated in three ways: Classic, SharedGPU, and Hybrid. Classic simply steps forward in time and calls the kernel once every timestep (predictor step or full step). SharedGPU uses the GPU for all computation and applies the swept rule. Hybrid applies the swept rule but computes the node on the boundary with the CPU. */ -/* +/** Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND Daniel Magee, mageed@oregonstate.edu */ @@ -38,9 +38,10 @@ #include #include +// This file uses vector types to hold the dependent variables so fundamental operations on those types are defined as macros to accommodate different data types. Also, keeping types consistent for common constants (0, 1, 2, etc) used in computation has an appreciable positive effect on performance. +#define GPUNUM 0 -// This file uses vector types to hold the dependent variables so fundamental operations on those types are defined as macros to accommodate different data types. Also, keeping types consistent for common constants (0, 1, 2, etc) used in computation has an appreciable positive effect on performance. #ifndef REAL #define REAL float #define REALtwo float2 @@ -438,9 +439,9 @@ upTriangle(const REALthree *IC, REALthree *outRight, REALthree *outLeft) extern __shared__ REALthree temper[]; int gid = blockDim.x * blockIdx.x + threadIdx.x; //Global Thread ID - int tididx = threadIdx.x + 2; //Block Thread ID - int tidxTop = tididx + dimens.base; // - int k=4; + int tididx = threadIdx.x + 2; //Thread's lane in the node [2, blockDim.x+1] + int tidxTop = tididx + dimens.base; //Thread's second row lane. + int k=4; //Start k at 4 since the base and second step occur before loop. //Assign the initial values to the first row in temper, each block //has it's own version of temper shared among its threads. @@ -448,6 +449,7 @@ upTriangle(const REALthree *IC, REALthree *outRight, REALthree *outLeft) __syncthreads(); + //First step gets predictor values for lanes excluding outer two on each side. if (threadIdx.x > 1 && threadIdx.x <(blockDim.x-2)) { temper[tidxTop] = eulerStutterStep(temper, tididx, false, false); @@ -455,7 +457,7 @@ upTriangle(const REALthree *IC, REALthree *outRight, REALthree *outLeft) __syncthreads(); - //The initial conditions are timslice 0 so start k at 1. + //Step through solution excluding two more lanes on each side each step. while (k < (blockDim.x>>1)) { if (threadIdx.x < (blockDim.x-k) && threadIdx.x >= k) @@ -497,10 +499,13 @@ downTriangle(REALthree *IC, const REALthree *inRight, const REALthree *inLeft) int gid = blockDim.x * blockIdx.x + threadIdx.x; int tididx = threadIdx.x + 2; int tidxTop = tididx + dimens.base; - int k = dimens.hts[2]; + //k starts at blockDim.x/2 and shrinks from there as the lane width grows. + int k = dimens.hts[2]; readIn(temper, inRight, inLeft, threadIdx.x, gid); + //Masks edges (whole domain edges not nodal edges) on last timestep. + //Stored in one register per thread. const char4 truth = {gid == 0, gid == 1, gid == dimens.idxend_1, gid == dimens.idxend}; __syncthreads(); @@ -532,13 +537,14 @@ downTriangle(REALthree *IC, const REALthree *inRight, const REALthree *inLeft) /** Builds an diamond using the swept rule after a left pass. - Unsplit diamond using the swept rule. wholeDiamond must apply boundary conditions only at it's center. + Unsplit diamond using the swept rule. + wholeDiamond must apply boundary conditions only at edges at only on it's center timestep. @param inRight Array of right edges seeding solution vector. @param inLeft Array of left edges seeding solution vector. @param outRight Array to store the right sides of the triangles to be passed. @param outLeft Array to store the left sides of the triangles to be passed. - @param Full True if there is not a node run on the CPU, false otherwise. + @param split True if there is a node run on the CPU, false otherwise. */ __global__ void @@ -551,8 +557,12 @@ wholeDiamond(const REALthree *inRight, const REALthree *inLeft, REALthree *outRi int tididx = threadIdx.x + 2; int tidxTop = tididx + dimens.base; + //Masks edges in the same way as downTriangle char4 truth = {gid == 0, gid == 1, gid == dimens.idxend_1, gid == dimens.idxend}; + //If this is a hybrid scheme and this kernel is run alongside a cpu function: + //Add a block size to the global id (to pick and place the correct values from the global array) + //And reset the truth values since this is not going to have any edges. if (split) { gid += blockDim.x; @@ -563,6 +573,8 @@ wholeDiamond(const REALthree *inRight, const REALthree *inLeft, REALthree *outRi __syncthreads(); + //k starts behind the downTriangle k because we need to do the first timestep outside the loop + //to get the order right. int k = dimens.hts[0]; if (tididx < (dimens.base-dimens.hts[2]) && tididx >= dimens.hts[2]) @@ -627,6 +639,7 @@ wholeDiamond(const REALthree *inRight, const REALthree *inLeft, REALthree *outRi __syncthreads(); } + //Split is the hybrid version of splitDiamond so write out left. Otherwise right. if (split) { writeOutLeft(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x); @@ -638,25 +651,35 @@ wholeDiamond(const REALthree *inRight, const REALthree *inLeft, REALthree *outRi } -//Split one is always first. +/** + Builds an diamond using the swept rule after a right pass. + + Diamond with node 0 split across the boundary at it's center. + + @param inRight Array of right edges seeding solution vector. + @param inLeft Array of left edges seeding solution vector. + @param outRight Array to store the right sides of the triangles to be passed. + @param outLeft Array to store the left sides of the triangles to be passed. +*/ __global__ void splitDiamond(REALthree *inRight, REALthree *inLeft, REALthree *outRight, REALthree *outLeft) { extern __shared__ REALthree temper[]; - //Same as upTriangle int gid = blockDim.x * blockIdx.x + threadIdx.x; int tididx = threadIdx.x + 2; int tidxTop = tididx + dimens.base; - int k = dimens.hts[2]; + int k = dimens.hts[2]; //Starts more like downTriangle readIn(temper, inRight, inLeft, threadIdx.x, gid); + //The edges are now in the center of the node 0 which is easy to find using the global id. const char4 truth = {gid == dimens.hts[0], gid == dimens.hts[1], gid == dimens.hts[2], gid == dimens.hts[3]}; __syncthreads(); + //Still need to set the boundary values first because they aren't necessarily preserved in the global arrays. if (truth.z) { temper[tididx] = dbd[0]; @@ -672,7 +695,6 @@ splitDiamond(REALthree *inRight, REALthree *inLeft, REALthree *outRight, REALthr while(k>0) { - if (!truth.y && !truth.z && tididx < (dimens.base-k) && tididx >= k) { temper[tidxTop] = eulerStutterStep(temper, tididx, truth.w, truth.x); @@ -698,7 +720,6 @@ splitDiamond(REALthree *inRight, REALthree *inLeft, REALthree *outRight, REALthr __syncthreads(); k=4; - //The initial conditions are timslice 0 so start k at 1. while(k= k) @@ -722,9 +743,10 @@ splitDiamond(REALthree *inRight, REALthree *inLeft, REALthree *outRight, REALthr writeOutLeft(temper, outRight, outLeft, threadIdx.x, gid, blockDim.x); } - +//Now we can set the namespace. using namespace std; +//Get energy out from conserved variables for plotting. __host__ __forceinline__ REAL @@ -734,22 +756,30 @@ energy(REALthree subj) return subj.z/subj.x - HALF*u*u; } +/** + Do a node on the CPU. The diamond split along the boundary. + + @param temper The working array. + @param htcpu The index for masking. +*/ + __host__ void CPU_diamond(REALthree *temper, int htcpu[5]) { - omp_set_num_threads(8); + //Assert the boundary conditions. temper[htcpu[2]] = bd[0]; temper[htcpu[2]+dimz.base] = bd[0]; temper[htcpu[1]] = bd[1]; temper[htcpu[1]+dimz.base] = bd[1]; - //Splitting it is the whole point! + //Continue stepping forward and expanding the mask until the loop stops. for (int k = htcpu[0]; k>0; k-=4) { + //Parallelize the loop over the unmasked indices. #pragma omp parallel for for(int n = k; n<(dimz.base-k); n++) { @@ -769,6 +799,7 @@ CPU_diamond(REALthree *temper, int htcpu[5]) } } + //Do the central loop that covers the entire nodal domain. #pragma omp parallel for for(int n = 4; n < (dimz.base-4); n++) { @@ -778,7 +809,7 @@ CPU_diamond(REALthree *temper, int htcpu[5]) } } - //Top part. + //And do the top part. Shrink the mask until the loop ends. for (int k = 6; k>> (dEuler_in, dEuler_out, false); classicEuler <<< bks,tpb >>> (dEuler_out, dEuler_in, true); t_eq += dt; + //If multiple timesteps should be written out do so here with the file provided. if (t_eq > twrite) { cudaMemcpy(T_f, dEuler_in, sizeof(REALthree)*dv, cudaMemcpyDeviceToHost); @@ -858,30 +894,34 @@ classicWrapper(const int bks, int tpb, const int dv, const double dt, const doub } -//The wrapper that calls the routine functions. +//The wrapper that enacts the swept rule. double sweptWrapper(const int bks, int tpb, const int dv, const double dt, const double t_end, const int cpu, REALthree *IC, REALthree *T_f, const double freq, ofstream &fwr) { - const size_t smem = (2*dimz.base)*sizeof(REALthree); - const int cpuLoc = dv-tpb; + const size_t smem = (2*dimz.base)*sizeof(REALthree); //Amt of shared memory to request + const int cpuLoc = dv-tpb; //Where to write the cpu values back to device memory to make swap if hybrid. + // CPU mask values for boundary. int htcpu[5]; for (int k=0; k<5; k++) htcpu[k] = dimz.hts[k]+2; REALthree *d_IC, *d0_right, *d0_left, *d2_right, *d2_left; + // Allocate device global memory cudaMalloc((void **)&d_IC, sizeof(REALthree)*dv); cudaMalloc((void **)&d0_right, sizeof(REALthree)*dv); cudaMalloc((void **)&d0_left, sizeof(REALthree)*dv); cudaMalloc((void **)&d2_right, sizeof(REALthree)*dv); cudaMalloc((void **)&d2_left, sizeof(REALthree)*dv); + // Transfer over the initial conditions. cudaMemcpy(d_IC,IC,sizeof(REALthree)*dv,cudaMemcpyHostToDevice); - // Start the counter and start the clock. + // Start the simulation time counter and start the clock. const double t_fullstep = 0.25*dt*(double)tpb; + //Call up first out of loop with right and left 0 upTriangle <<>> (d_IC, d0_right, d0_left); double t_eq; @@ -889,8 +929,10 @@ sweptWrapper(const int bks, int tpb, const int dv, const double dt, const double // Call the kernels until you reach the final time + // The hybrid version. if (cpu) { + // Tell us what it is and allocate the host arrays. cudaHostAlloc is pinned memory that transfers faster. cout << "Hybrid Swept scheme" << endl; REALthree *h_right, *h_left; @@ -898,34 +940,39 @@ sweptWrapper(const int bks, int tpb, const int dv, const double dt, const double cudaHostAlloc((void **) &h_right, tpb*sizeof(REALthree), cudaHostAllocDefault); cudaHostAlloc((void **) &h_left, tpb*sizeof(REALthree), cudaHostAllocDefault); - t_eq = t_fullstep; + t_eq = t_fullstep; + // Start 3 cuda streams to overlap memory transfer and kernel launch with cpu computation. cudaStream_t st1, st2, st3; cudaStreamCreate(&st1); cudaStreamCreate(&st2); cudaStreamCreate(&st3); - //Split Diamond Begin------ + // Split Diamond Begin------ + // Call wholeDiamond on first non-default stream and launch with one missing block. wholeDiamond <<>> (d0_right, d0_left, d2_right, d2_left, true); - + // Simultaneously transfer the right and left values for node1 to the CPU. cudaMemcpyAsync(h_left, d0_left, tpb*sizeof(REALthree), cudaMemcpyDeviceToHost, st2); cudaMemcpyAsync(h_right, d0_right, tpb*sizeof(REALthree), cudaMemcpyDeviceToHost, st3); + // Wait for memory to arrive. and read the edges into the working array: tmpr. cudaStreamSynchronize(st2); cudaStreamSynchronize(st3); // CPU Part Start ----- for (int k=0; k>> (d2_right, d2_left, d0_right, d0_left, false); splitDiamond <<>> (d0_right, d0_left, d2_right, d2_left); - //So it always ends on a left pass since the down triangle is a right pass. + //It always ends on a left pass since the down triangle is a right pass. t_eq += t_fullstep; if (t_eq > twrite) @@ -1048,7 +1094,7 @@ sweptWrapper(const int bks, int tpb, const int dv, const double dt, const double } } } - + // The last call is down so call it and pass the relevant data to the host with memcpy. downTriangle <<>> (d_IC, d2_right, d2_left); cudaMemcpy(T_f, d_IC, sizeof(REALthree)*dv, cudaMemcpyDeviceToHost); @@ -1064,17 +1110,17 @@ sweptWrapper(const int bks, int tpb, const int dv, const double dt, const double int main( int argc, char *argv[] ) { - //That is, there are less than 8 arguments. + // That is, there must be 8 or 9 argumets arguments. if (argc < 8) { cout << "The Program takes 8 inputs, #Divisions, #Threads/block, deltat, finish time, output frequency..." << endl; cout << "Algorithm type, Variable Output File, Timing Output File (optional)" << endl; exit(-1); } - cout.precision(10); + cout.precision(10); // Choose the GPGPU. This is device 0 in my machine which has 2 devices. - cudaSetDevice(0); + cudaSetDevice(GPUNUM); if (sizeof(REAL)>6) cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); dimz.gam = 1.4; @@ -1087,30 +1133,29 @@ int main( int argc, char *argv[] ) bd[0].z = ONE/dimz.mgam; //Energy bd[1].z = 0.1/dimz.mgam; - const int dv = atoi(argv[1]); //Number of spatial points const int tpb = atoi(argv[2]); //Threads per Block - const double dt = atof(argv[3]); + const double dt = atof(argv[3]); //Timestep const double tf = atof(argv[4]) - QUARTER*dt; //Finish time - const double freq = atof(argv[5]); + const double freq = atof(argv[5]); //Frequency of output (i.e. every 20 s (simulation time)) const int scheme = atoi(argv[6]); //2 for Alternate, 1 for GPUShared, 0 for Classic const int bks = dv/tpb; //The number of blocks - const double dx = lx/((REAL)dv-TWO); + const double dx = lx/((REAL)dv-TWO); //Grid size. char const *prec; prec = (sizeof(REAL)<6) ? "Single": "Double"; //Declare the dimensions in constant memory. dimz.dt_dx = dt/dx; // dt/dx - dimz.base = tpb+4; - dimz.idxend = dv-1; - dimz.idxend_1 = dv-2; + dimz.base = tpb+4; // Length of the base of a node. + dimz.idxend = dv-1; // Index of last spatial point. + dimz.idxend_1 = dv-2; // 2nd to last spatial point. - for (int k=-2; k<3; k++) dimz.hts[k+2] = (tpb/2) + k; + for (int k=-2; k<3; k++) dimz.hts[k+2] = (tpb/2) + k; //Middle values in the node (masking values) cout << "Euler --- #Blocks: " << bks << " | Length: " << lx << " | Precision: " << prec << " | dt/dx: " << dimz.dt_dx << endl; - //Conditions for main input. - //dv and tpb must be powers of two. dv must be larger than tpb and divisible by tpb. + // Conditions for main input. + // dv and tpb must be powers of two. dv must be larger than tpb and divisible by tpb. if ((dv & (tpb-1) !=0) || (tpb&31) != 0) { @@ -1127,10 +1172,10 @@ int main( int argc, char *argv[] ) // Initialize arrays. REALthree *IC, *T_final; - cudaHostAlloc((void **) &IC, dv*sizeof(REALthree), cudaHostAllocDefault); - cudaHostAlloc((void **) &T_final, dv*sizeof(REALthree), cudaHostAllocDefault); + cudaHostAlloc((void **) &IC, dv*sizeof(REALthree), cudaHostAllocDefault); // Initial conditions + cudaHostAlloc((void **) &T_final, dv*sizeof(REALthree), cudaHostAllocDefault); // Final values - for (int k = 0; k7) { ofstream ftime; diff --git a/SweptSource/Heat1D_SweptShared.cu b/SweptSource/Heat1D_SweptShared.cu index ee00dbb..b598619 100644 --- a/SweptSource/Heat1D_SweptShared.cu +++ b/SweptSource/Heat1D_SweptShared.cu @@ -1,4 +1,4 @@ -/* +/** Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND Daniel Magee, mageed@oregonstate.edu */ @@ -23,6 +23,8 @@ #include #include +#define GPUNUM 0 + #ifndef REAL #define REAL float #define HALF 0.5f @@ -701,7 +703,7 @@ int main(int argc, char *argv[]) cout.precision(10); // Choose the GPGPU. This is device 0 in my machine which has 2 devices. - cudaSetDevice(0); + cudaSetDevice(GPUNUM); if (sizeof(REAL)>6) cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); int dv = atoi(argv[1]); //Number of spatial points @@ -788,6 +790,14 @@ int main(int argc, char *argv[]) cudaEventSynchronize(stop); cudaEventElapsedTime( &timed, start, stop); + cudaError_t error = cudaGetLastError(); + if(error != cudaSuccess) + { + // print the CUDA error message and exit + printf("CUDA error: %s\n", cudaGetErrorString(error)); + exit(-1); + } + timed *= 1.e3; double n_timesteps = tfm/dt; @@ -811,6 +821,7 @@ int main(int argc, char *argv[]) fwr.close(); // Free the memory and reset the device. + cudaDeviceSynchronize(); cudaEventDestroy( start ); cudaEventDestroy( stop ); diff --git a/SweptSource/KS1D_SweptShared.cu b/SweptSource/KS1D_SweptShared.cu index d2990fc..a432247 100644 --- a/SweptSource/KS1D_SweptShared.cu +++ b/SweptSource/KS1D_SweptShared.cu @@ -1,4 +1,4 @@ -/* +/** Copyright (C) 2017 Kyle Niemeyer, niemeyek@oregonstate.edu AND Daniel Magee, mageed@oregonstate.edu */ @@ -23,6 +23,8 @@ #include #include +#define GPUNUM 0 + #ifndef REAL #define REAL float #define ONE 1.f @@ -870,7 +872,7 @@ int main( int argc, char *argv[]) cout.precision(10); // Choose the GPGPU. This is device 0 in my machine which has 2 devices. - cudaSetDevice(0); + cudaSetDevice(GPUNUM); if (sizeof(REAL)>6) cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); const int dv = atoi(argv[1]); //Number of spatial points