Skip to content

Commit

Permalink
CDP completely removed from simulation code + activate use-fast-math …
Browse files Browse the repository at this point in the history
…compiler flag
  • Loading branch information
chrxh committed Jan 10, 2022
1 parent 082bc30 commit 4bb2783
Show file tree
Hide file tree
Showing 5 changed files with 15 additions and 38 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 17)

set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -lineinfo --use-local-env")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -lineinfo --use-local-env -use_fast_math")

project(alien-project LANGUAGES C CXX CUDA)

Expand Down
25 changes: 6 additions & 19 deletions source/EngineGpuKernels/DataAccessKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -74,12 +74,12 @@ namespace
particleTO.vel = particle->vel;
particleTO.energy = particle->energy;
}

}

/************************************************************************/
/* Main */
/************************************************************************/

__global__ void cudaGetSelectedCellDataWithoutConnections(SimulationData data, bool includeClusters, DataAccessTO dataTO)
{
auto const& cells = data.entities.cellPointers;
Expand Down Expand Up @@ -338,24 +338,6 @@ __global__ void cudaClearDataTO(DataAccessTO dataTO)
*dataTO.numStringBytes = 0;
}

//Kernel wrapper that forwards to saveNumEntries() on the entity collection
//of the simulation data passed in.
__global__ void cudaSaveNumEntries(SimulationData data)
{
    auto& entities = data.entities;
    entities.saveNumEntries();
}

//Kernel that resets the counters of the transfer object and then launches four child
//kernels from device code to fill it (cells, connections, tokens, particles).
//
//data:            simulation data forwarded to every child kernel
//includeClusters: forwarded to the cell and particle selection kernels
//dataTO:          transfer object whose counters are reset here and which is
//                 forwarded to every child kernel
//
//There is no thread-index guard: every thread that executes this kernel performs
//the resets and the launches below.
//NOTE(review): presumably meant to be launched with a single thread - confirm at the call site.
__global__ void cudaGetSelectedSimulationData(SimulationData data, bool includeClusters, DataAccessTO dataTO)
{
//Reset all counters of the transfer object before the child kernels run.
*dataTO.numCells = 0;
*dataTO.numParticles = 0;
*dataTO.numTokens = 0;
*dataTO.numStringBytes = 0;

//Each DEPRECATED_KERNEL_CALL_SYNC launches the named kernel from device code and then
//calls cudaDeviceSynchronize() (see Macros.cuh), so the four steps run one after another.
//NOTE(review): cudaResolveConnections presumably relies on the cell data written by the
//preceding launch - keep the order unless verified otherwise.
DEPRECATED_KERNEL_CALL_SYNC(cudaGetSelectedCellDataWithoutConnections, data, includeClusters, dataTO);
DEPRECATED_KERNEL_CALL_SYNC(cudaResolveConnections, data, dataTO);
DEPRECATED_KERNEL_CALL_SYNC(cudaGetTokenData, data, dataTO);
DEPRECATED_KERNEL_CALL_SYNC(cudaGetSelectedParticleData, data, dataTO);
}

__global__ void cudaClearData(SimulationData data)
{
data.entities.cellPointers.reset();
Expand All @@ -366,3 +348,8 @@ __global__ void cudaClearData(SimulationData data)
data.entities.particles.reset();
data.entities.dynamicMemory.reset();
}

//Kernel wrapper: invokes saveNumEntries() on the entities member of the
//simulation data it receives.
__global__ void cudaSaveNumEntries(SimulationData data)
{
    auto& entityData = data.entities;
    entityData.saveNumEntries();
}
1 change: 0 additions & 1 deletion source/EngineGpuKernels/DataAccessKernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,5 +27,4 @@ __global__ void cudaCreateDataFromTO(SimulationData data, DataAccessTO dataTO, b
__global__ void cudaAdaptNumberGenerator(CudaNumberGenerator numberGen, DataAccessTO dataTO);
__global__ void cudaClearDataTO(DataAccessTO dataTO);
__global__ void cudaSaveNumEntries(SimulationData data);
__global__ void cudaGetInspectedSimulationData(SimulationData data, InspectedEntityIds entityIds, DataAccessTO dataTO);
__global__ void cudaClearData(SimulationData data);
17 changes: 0 additions & 17 deletions source/EngineGpuKernels/Macros.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -11,23 +11,6 @@
//Launches kernel `func` with gpuSettings.NUM_BLOCKS blocks of
//gpuSettings.NUM_THREADS_PER_BLOCK threads, forwarding the remaining arguments.
//An object named `gpuSettings` must be in scope at the point of use.
//The expansion already ends with a semicolon and performs no synchronization or error check.
#define KERNEL_CALL(func, ...) func<<<gpuSettings.NUM_BLOCKS, gpuSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__);
//Launches kernel `func` with a single block containing a single thread, forwarding the
//remaining arguments. The expansion already ends with a semicolon.
#define KERNEL_CALL_1_1(func, ...) func<<<1, 1>>>(__VA_ARGS__);

//#TODO remove following macros
//All macros below expand to one or more complete statements (each ending with a semicolon).
//The multi-statement ones are NOT wrapped in a block: when used as the body of an unbraced
//if/for/while, only the kernel launch is governed by the condition.

//Launches `func` with cudaThreadSettings.NUM_BLOCKS blocks of
//cudaThreadSettings.NUM_THREADS_PER_BLOCK threads; `cudaThreadSettings` must be in scope.
#define DEPRECATED_KERNEL_CALL(func, ...) func<<<cudaThreadSettings.NUM_BLOCKS, cudaThreadSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__);
//Launches `func` with one block of one thread.
#define DEPRECATED_KERNEL_CALL_1_1(func, ...) func<<<1, 1>>>(__VA_ARGS__);

//Launches `func` with one block of one thread, calls cudaDeviceSynchronize() (return value
//discarded) and then passes cudaGetLastError() to CHECK_FOR_CUDA_ERROR.
#define DEPRECATED_KERNEL_CALL_HOST_SYNC(func, ...) \
func<<<1, 1>>>(__VA_ARGS__); \
cudaDeviceSynchronize(); \
CHECK_FOR_CUDA_ERROR(cudaGetLastError());

//Launches `func` with the cudaThreadSettings grid configuration and then calls
//cudaDeviceSynchronize(); the return value is discarded and no error check is performed.
#define DEPRECATED_KERNEL_CALL_SYNC(func, ...) \
func<<<cudaThreadSettings.NUM_BLOCKS, cudaThreadSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__); \
cudaDeviceSynchronize();

//Launches `func` with one block of one thread and then calls cudaDeviceSynchronize();
//the return value is discarded and no error check is performed.
#define DEPRECATED_KERNEL_CALL_SYNC_1_1(func, ...) \
func<<<1, 1>>>(__VA_ARGS__); \
cudaDeviceSynchronize();

template< typename T >
void checkAndThrowError(T result, char const *const func, const char *const file, int const line)
{
Expand Down
8 changes: 8 additions & 0 deletions source/EngineGpuKernels/SimulationKernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -104,3 +104,11 @@ __global__ void processingStep13(SimulationData data)
TokenProcessor tokenProcessor;
tokenProcessor.deleteTokenIfCellDeleted(data);
}

//This is the only kernel that uses dynamic parallelism (a kernel launch issued from device code).
//When it is removed, performance drops by about 20% for unknown reasons.
//NOTE(review): keep both kernels exactly as they are until the cause of the slowdown is understood.

//Empty kernel; it is the target of the device-side launch in dummy() below.
__global__ void nestedDummy() {}
//Launches nestedDummy with one block of one thread from device code and does nothing else.
//There is no thread-index guard, so every thread executing dummy() issues its own launch.
__global__ void dummy()
{
nestedDummy<<<1, 1>>>();
}

0 comments on commit 4bb2783

Please sign in to comment.