CDP (CUDA dynamic parallelism) completely removed from simulation code + activate the -use_fast_math compiler flag
chrxh · Jan 10, 2022 · 4bb2783 · 4bb2783
1 parent 082bc30
commit 4bb2783
Show file tree

Hide file tree

Showing 5 changed files with 15 additions and 38 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -19,7 +19,7 @@ set(CMAKE_CXX_STANDARD 17)
 
 set(CMAKE_CUDA_SEPARABLE_COMPILATION ON)
 
-set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -lineinfo --use-local-env")
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -g -lineinfo --use-local-env -use_fast_math")
 
 project(alien-project LANGUAGES C CXX CUDA)
 

diff --git a/source/EngineGpuKernels/DataAccessKernels.cu b/source/EngineGpuKernels/DataAccessKernels.cu
@@ -74,12 +74,12 @@ namespace
         particleTO.vel = particle->vel;
         particleTO.energy = particle->energy;
     }
+
 }
 
 /************************************************************************/
 /* Main                                                                 */
 /************************************************************************/
-
 __global__ void cudaGetSelectedCellDataWithoutConnections(SimulationData data, bool includeClusters, DataAccessTO dataTO)
 {
     auto const& cells = data.entities.cellPointers;
@@ -338,24 +338,6 @@ __global__ void cudaClearDataTO(DataAccessTO dataTO)
     *dataTO.numStringBytes = 0;
 }
 
-__global__ void cudaSaveNumEntries(SimulationData data)
-{
-    data.entities.saveNumEntries();
-}
-
-__global__ void cudaGetSelectedSimulationData(SimulationData data, bool includeClusters, DataAccessTO dataTO)
-{
-    *dataTO.numCells = 0;
-    *dataTO.numParticles = 0;
-    *dataTO.numTokens = 0;
-    *dataTO.numStringBytes = 0;
-
-    DEPRECATED_KERNEL_CALL_SYNC(cudaGetSelectedCellDataWithoutConnections, data, includeClusters, dataTO);
-    DEPRECATED_KERNEL_CALL_SYNC(cudaResolveConnections, data, dataTO);
-    DEPRECATED_KERNEL_CALL_SYNC(cudaGetTokenData, data, dataTO);
-    DEPRECATED_KERNEL_CALL_SYNC(cudaGetSelectedParticleData, data, dataTO);
-}
-
 __global__ void cudaClearData(SimulationData data)
 {
     data.entities.cellPointers.reset();
@@ -366,3 +348,8 @@ __global__ void cudaClearData(SimulationData data)
     data.entities.particles.reset();
     data.entities.dynamicMemory.reset();
 }
+
+__global__ void cudaSaveNumEntries(SimulationData data)  // kernel wrapper around data.entities.saveNumEntries()
+{
+    data.entities.saveNumEntries();  // no threadIdx/blockIdx use here: every launched thread makes this same call
+}
diff --git a/source/EngineGpuKernels/DataAccessKernels.cuh b/source/EngineGpuKernels/DataAccessKernels.cuh
@@ -27,5 +27,4 @@ __global__ void cudaCreateDataFromTO(SimulationData data, DataAccessTO dataTO, b
 __global__ void cudaAdaptNumberGenerator(CudaNumberGenerator numberGen, DataAccessTO dataTO);
 __global__ void cudaClearDataTO(DataAccessTO dataTO);
 __global__ void cudaSaveNumEntries(SimulationData data);
-__global__ void cudaGetInspectedSimulationData(SimulationData data, InspectedEntityIds entityIds, DataAccessTO dataTO);
 __global__ void cudaClearData(SimulationData data);
diff --git a/source/EngineGpuKernels/Macros.cuh b/source/EngineGpuKernels/Macros.cuh
@@ -11,23 +11,6 @@
 #define KERNEL_CALL(func, ...) func<<<gpuSettings.NUM_BLOCKS, gpuSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__);
 #define KERNEL_CALL_1_1(func, ...) func<<<1, 1>>>(__VA_ARGS__);
 
-//#TODO remove following macros
-#define DEPRECATED_KERNEL_CALL(func, ...) func<<<cudaThreadSettings.NUM_BLOCKS, cudaThreadSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__);
-#define DEPRECATED_KERNEL_CALL_1_1(func, ...) func<<<1, 1>>>(__VA_ARGS__);
-
-#define DEPRECATED_KERNEL_CALL_HOST_SYNC(func, ...) \
-    func<<<1, 1>>>(__VA_ARGS__); \
-    cudaDeviceSynchronize(); \
-    CHECK_FOR_CUDA_ERROR(cudaGetLastError());
-
-#define DEPRECATED_KERNEL_CALL_SYNC(func, ...)  \
-        func<<<cudaThreadSettings.NUM_BLOCKS, cudaThreadSettings.NUM_THREADS_PER_BLOCK>>>(__VA_ARGS__); \
-        cudaDeviceSynchronize();
-
-#define DEPRECATED_KERNEL_CALL_SYNC_1_1(func, ...)  \
-        func<<<1, 1>>>(__VA_ARGS__); \
-        cudaDeviceSynchronize();
-
 template< typename T >
 void checkAndThrowError(T result, char const *const func, const char *const file, int const line)
 {

diff --git a/source/EngineGpuKernels/SimulationKernels.cu b/source/EngineGpuKernels/SimulationKernels.cu
@@ -104,3 +104,11 @@ __global__ void processingStep13(SimulationData data)
     TokenProcessor tokenProcessor;
     tokenProcessor.deleteTokenIfCellDeleted(data);
 }
+
+// `dummy` is the only kernel that uses CUDA dynamic parallelism (it launches another kernel from device code).
+// It does no work itself; when it is removed, performance drops by about 20% for unknown reasons.
+__global__ void nestedDummy() {}  // empty child kernel; serves only as the launch target of dummy()
+__global__ void dummy()
+{
+    nestedDummy<<<1, 1>>>();  // device-side launch: 1 block of 1 thread
+}