Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

cmake conditions similar to main branch #16

Open
wants to merge 28 commits into
base: group_sim_combined
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
366b0d5
hip for nvidia backend compiles
atif4461 May 29, 2024
8379d6c
perlmutter
atif4461 May 30, 2024
ce46743
uncommented Rand4Hits_finish
atif4461 May 30, 2024
396c780
uncommented load_hit_sim and simulate
atif4461 May 30, 2024
8cb14b2
conditional compilation for RNDCPU_GEN
atif4461 Jun 11, 2024
28cfd3e
added build instruction for lambda2 HIP AMD backend
atif4461 Jun 11, 2024
3790649
Merge pull request #2 from atif4461/group_sim_combined_hip_nv
atif4461 Jun 11, 2024
63927bd
added checks for envs and rngs
atif4461 Jun 26, 2024
a4591f0
cleaned CMakeLists
atif4461 Jun 26, 2024
56caafc
cleaned CMakeLists
atif4461 Jun 26, 2024
94ba113
removed extra hip_runtime include
atif4461 Jul 1, 2024
827c4a1
first omp-portable-rng apis
atif4461 Jul 1, 2024
ef7afae
added readme
atif4461 Jul 1, 2024
bb9982d
fixed runtime errors
atif4461 Jul 2, 2024
d07f92d
added checks for hip for nv
atif4461 Jul 18, 2024
936f4e2
commented gpuq for openmp amd bug
atif4461 Jul 27, 2024
4feccf6
amd working state
atif4461 Jul 27, 2024
3c5ac37
added parser for openmp offload-arch
Oct 2, 2024
3d34164
added openmp multicore cpu
Oct 2, 2024
0219766
added script, updated readme
Oct 2, 2024
82a1fce
fixed random123 issues
atif4461 Oct 10, 2024
9802a97
updated with changes from group_sim_combined
atif4461 Oct 10, 2024
a47ae27
fixed remaining conflicts
atif4461 Oct 10, 2024
3f97c17
added rocrand for openmp on amd devices
atif4461 Oct 17, 2024
61eb416
added random 123 build scripts
atif4461 Oct 21, 2024
426850d
merged portable_rng, tested exalearn 4,5
atif4461 Oct 21, 2024
6653e21
fixed omp rng bugs
atif4461 Oct 21, 2024
cf986a7
added dockerfile llvm amdgpu
atif4461 Dec 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions FastCaloSimAnalyzer/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ set(USE_ALPAKA OFF CACHE BOOL "Use alpaka")
set(USE_HIP OFF CACHE BOOL "Use HIP")
set(HIP_TARGET "AMD" CACHE STRING "HIP backend. must be either AMD or NVIDIA")


if ( USE_STDPAR )
if ( ${STDPAR_TARGET} STREQUAL "cpu" )
if ( NOT RNDGEN_CPU )
Expand All @@ -46,11 +45,13 @@ elseif( USE_KOKKOS )
elseif(USE_ALPAKA)
find_package(alpaka REQUIRED)
elseif(USE_HIP)
find_package(HIP REQUIRED)
if ( ${HIP_TARGET} STREQUAL "NVIDIA" )
find_package(HIP)
if ( NOT RNDGEN_CPU )
message(FATAL_ERROR "when HIP_TARGET=NVIDIA, RNDGEN_CPU must be ON")
endif()
else()
find_package(HIP REQUIRED)
endif()
endif()

Expand Down
1 change: 1 addition & 0 deletions FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/Rand4Hits.h
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,7 @@ class Rand4Hits {
unsigned int m_current_hits;
void *m_gen{ nullptr };
bool m_useCPU{ false };
unsigned long long m_seed{0};

// patch in some GPU pointers for cudaMalloc
CELL_ENE_T *m_cells_energy{ 0 };
Expand Down
126 changes: 99 additions & 27 deletions FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -20,37 +20,52 @@ endif()
if(USE_HIP)
set(FIND_CUDA OFF)
endif()


if(ENABLE_OMPGPU)
string(STRIP ${CMAKE_CXX_FLAGS} OMP_OFFLOAD_TARGET)
string(FIND ${CMAKE_CXX_FLAGS} "gfx" OMP_OFFLOAD_TARGET_AMD)
string(FIND ${CMAKE_CXX_FLAGS} "sm_" OMP_OFFLOAD_TARGET_NVIDIA)
if(OMP_OFFLOAD_TARGET_NVIDIA GREATER 0)
message(STATUS "OMP_OFFLOAD_TARGET NVIDIA" )
elseif(OMP_OFFLOAD_TARGET_AMD GREATER 0)
set(FIND_CUDA OFF)
message(STATUS "OMP_OFFLOAD_TARGET AMD" )
else()
if($ENV{OMP_TARGET_OFFLOAD} MATCHES "disabled")
set(FIND_CUDA OFF)
else()
message(FATAL_ERROR "!! Please specify OpenMP offload target via -DCMAKE_CXX_FLAGS=\"--offload-arch=gfx<>|sm_<>\" or set environment var OMP_TARGET_OFFLOAD=disabled")
endif()
endif()
endif()

if(FIND_CUDA)
find_package(CUDA REQUIRED)
enable_language( CUDA )
set(CUDA_LIBRARIES PUBLIC ${CUDA_LIBRARIES})
endif()

# Add OpenMP
if(ENABLE_OMPGPU)
find_package(OpenMP)
if(OPENMP_FOUND)
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-cuda-mode")
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto")
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state")
set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " )
set(OpenMP_FLAGS "-fopenmp --offload-arch=sm_86 -lomp") ## nvidia
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}")
else()
message(WARNING "Configuring with OpenMP GPU but OpenMP is not found!")
endif()
endif()

# Sources

if(USE_STDPAR)
set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_sp.cxx gpuQ.cu Rand4Hits.cu Rand4Hits_sp.cxx )
elseif(USE_KOKKOS)
set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_kk.cxx DEV_BigMem_kk.cxx)
elseif(ENABLE_OMPGPU)
# Add OpenMP
find_package(OpenMP)
if(OPENMP_FOUND)
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-cuda-mode")
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto")
set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state")
set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " )
set(OpenMP_FLAGS "-fopenmp -lomp -lomptarget")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}")
else()
message(WARNING "Configuring with OpenMP GPU but OpenMP is not found!")
endif()
set(FastCaloGpu_Srcs KernelWrapper_omp.cxx gpuQ.cxx CaloGpuGeneral.cxx DEV_BigMem_omp.cxx )
elseif(USE_ALPAKA)
set(FastCaloGpu_Srcs CaloGpuGeneral.cxx KernelWrapper_al.cxx Rand4Hits_al.cxx )
Expand All @@ -66,10 +81,10 @@ elseif(USE_HIP)
include_directories( ${ROCM_PATH}/hip/include )

if ( ${HIP_TARGET} STREQUAL "AMD" )
message(STATUS " Using AMD HIP backend")
message(STATUS "Using AMD HIP backend")
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908")
elseif( ${HIP_TARGET} STREQUAL "NVIDIA" )
message(STATUS " Using NVIDIA HIP backend")
message(STATUS "Using NVIDIA HIP backend")
find_package(CUDAToolkit REQUIRED)
set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86")
set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908")
Expand Down Expand Up @@ -120,7 +135,20 @@ elseif(USE_HIP)
target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA)
target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart)
endif()

elseif(ENABLE_OMPGPU)
if(OMP_OFFLOAD_TARGET_NVIDIA GREATER 0)
target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DOMP_OFFLOAD_TARGET_NVIDIA)
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY})
endif()
if(OMP_OFFLOAD_TARGET_AMD GREATER 0)
target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -D__HIP_PLATFORM_AMD__)
target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DOMP_OFFLOAD_TARGET_AMD)
find_package(HIP REQUIRED)
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include )
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/include )
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/librocrand.so)
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libamdhip64.so)
endif()
else()
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY})
endif()
Expand Down Expand Up @@ -151,14 +179,58 @@ endif()
if(RNDGEN_CPU)
message(STATUS "Will generate random numbers on CPU")
target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU )
# TODO Link a portable RNG library
else()
if ( ${HIP_TARGET} STREQUAL "AMD" )
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include )
elseif(RNDGEN_OMP)
message(STATUS "Will generate random numbers using Portable OpenMP RNG Library")
target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_OMP )
if(NOT DEFINED OMPRNG_HOME)
include(FetchContent)
set(FETCHCONTENT_QUIET OFF)
FetchContent_Declare(
Portable-OpenMP-RNG
GIT_REPOSITORY https://github.com/GKNB/test-benchmark-OpenMP-RNG
GIT_TAG origin/main
)
FetchContent_Populate(Portable-OpenMP-RNG)
include_directories(${CMAKE_BINARY_DIR}/_deps/portable-openmp-rng-src)
else()
include_directories(${OMPRNG_HOME})
include_directories(${OMPRNG_HOME}/implementation)
endif()
if(ARCH_CUDA)
target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DARCH_CUDA )
target_link_libraries(${FastCaloGpu_LIB} PRIVATE ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY})
elseif(ARCH_HIP)
target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DARCH_HIP )
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include )
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so)
elseif( ${HIP_TARGET} STREQUAL "NVIDIA" )
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY})
elseif(USE_RANDOM123)
if(NOT DEFINED RANDOM123_HOME)
include(FetchContent)
set(FETCHCONTENT_QUIET OFF)
FetchContent_Declare(
random123
GIT_REPOSITORY https://github.com/DEShawResearch/random123
GIT_TAG origin/main
)
FetchContent_Populate(random123)
include_directories(${CMAKE_BINARY_DIR}/_deps/random123-src/include )
else()
include_directories(${RANDOM123_HOME}/include/)
endif()
target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DUSE_RANDOM123 )
endif()
else()
if(USE_HIP)
if ( ${HIP_TARGET} STREQUAL "AMD" )
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include )
target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include )
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so)
elseif( ${HIP_TARGET} STREQUAL "NVIDIA" )
target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY})
endif()
elseif(ENABLE_OMPGPU)
if(ENV{OMP_TARGET_OFFLOAD} MATCHES "disabled")
message(FATAL_ERROR "when OMP_TARGET_OFFLOAD disabled, RNDGEN_CPU must be ON")
endif()
endif()
endif()

Expand Down
4 changes: 0 additions & 4 deletions FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -8,16 +8,12 @@
#include "Hit.h"
#include "Rand4Hits.h"

#include "gpuQ.h"
#include "Args.h"
#include "DEV_BigMem.h"
// #include "OMP_BigMem.h"
#include <chrono>
#include <mutex>
#include <climits>

#include <cuda_runtime_api.h>
#include <curand.h>
#include <iostream>
#include <omp.h>

Expand Down
6 changes: 3 additions & 3 deletions FastCaloSimAnalyzer/FastCaloGpu/src/GeoRegion.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
#include <iostream>
#include <algorithm>

#define PI 3.14159265358979323846
#define PI_FCS 3.14159265358979323846
#define TWOPI 2 * 3.14159265358979323846

__HOSTDEV__ double Phi_mpi_pi(double x) {
while (x >= PI)
while (x >= PI_FCS)
x -= TWOPI;
while (x < -PI)
while (x < -PI_FCS)
x += TWOPI;
return x;
}
Expand Down
95 changes: 85 additions & 10 deletions FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx
Original file line number Diff line number Diff line change
@@ -1,22 +1,35 @@
/*
Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration
*/
#include "gpuQ.h"
#include "Rand4Hits.h"
#include "DEV_BigMem.h"

#include <omp.h>
#include <cuda_runtime_api.h>
#include <curand.h>

#include "GpuParams.h"
#include "Rand4Hits_cpu.cxx"

#define CURAND_CALL( x ) \
if ( ( x ) != CURAND_STATUS_SUCCESS ) { \
printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \
exit( EXIT_FAILURE ); \
}
#include <omp.h>
#ifdef RNDGEN_OMP
# include "openmp_rng.h"
#endif

#ifdef OMP_OFFLOAD_TARGET_NVIDIA
# include "gpuQ.h"
# include <cuda_runtime_api.h>
# include <curand.h>
# define CURAND_CALL( x ) \
if ( ( x ) != CURAND_STATUS_SUCCESS ) { \
printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \
exit( EXIT_FAILURE ); \
}
#elif defined OMP_OFFLOAD_TARGET_AMD
# include "hip/hip_runtime.h"
# include <rocrand.h>
# define ROCRAND_CALL( x ) \
if ((x) != ROCRAND_STATUS_SUCCESS) { \
printf("Error at %s:%d\n", __FILE__, __LINE__); \
exit(EXIT_FAILURE); \
}
#endif

void Rand4Hits::allocate_simulation( int maxbins, int maxhitct, unsigned long n_cells ) {

Expand Down Expand Up @@ -67,12 +80,23 @@ Rand4Hits::~Rand4Hits() {
<< " lost: " << DEV_BigMem::bm_ptr->lost() << std::endl;
delete DEV_BigMem::bm_ptr;
}
#ifdef RNDGEN_OMP
omp_target_free( m_rand_ptr, m_select_device );
#endif
if ( m_useCPU ) {
destroyCPUGen();
} else {
#ifndef RNDGEN_OMP
#ifndef USE_RANDOM123
#ifdef OMP_OFFLOAD_TARGET_NVIDIA
CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) );
delete (curandGenerator_t*)m_gen;
#elif defined OMP_OFFLOAD_TARGET_AMD
ROCRAND_CALL(rocrand_destroy_generator( *( (rocrand_generator*)m_gen)));
delete (rocrand_generator *)m_gen;
#endif
#endif
#endif
}
};

Expand All @@ -84,7 +108,28 @@ void Rand4Hits::rd_regen() {
std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl;
}
} else {
#ifdef RNDGEN_OMP
auto gen = generator_enum::xorwow;
# ifdef USE_RANDOM123
float* f_r123 = (float*) malloc ( 3 * m_total_a_hits * sizeof( float ) );
omp_get_rng_uniform_float(f_r123, 3 * m_total_a_hits, m_seed, gen);
if ( omp_target_memcpy( m_rand_ptr, f_r123, 3 * m_total_a_hits * sizeof( float ), m_offset, m_offset, m_select_device,
m_initial_device ) ) {
std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl;
}
free(f_r123);
# else
omp_get_rng_uniform_float(m_rand_ptr, 3 * m_total_a_hits, m_seed, gen);
# endif
#else
# ifndef RNDGEN_CPU
# ifdef OMP_OFFLOAD_TARGET_NVIDIA
CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) );
# elif defined OMP_OFFLOAD_TARGET_AMD
ROCRAND_CALL(rocrand_generate_uniform( *( (rocrand_generator*)m_gen), m_rand_ptr, 3 * m_total_a_hits));
# endif
# endif
#endif
}
};

Expand All @@ -105,12 +150,42 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) {
std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl;
}
} else {
#ifdef RNDGEN_OMP
f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device );
auto gen = generator_enum::xorwow;
#ifdef USE_RANDOM123
float* f_r123 = (float*) malloc ( num * sizeof( float ) );
omp_get_rng_uniform_float(f_r123, num, seed, gen);
if ( omp_target_memcpy( f, f_r123, num * sizeof( float ), m_offset, m_offset, m_select_device,
m_initial_device ) ) {
std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl;
}
free(f_r123);
#else
omp_get_rng_uniform_float(f, num, seed, gen);
#endif
m_gen = (void*)gen;
// We need to save the seed for rd_regen
m_seed = seed;
#else
#ifndef RNDGEN_CPU
#ifdef OMP_OFFLOAD_TARGET_NVIDIA
gpuQ( cudaMalloc( &f, num * sizeof( float ) ) );
curandGenerator_t* gen = new curandGenerator_t;
CURAND_CALL( curandCreateGenerator( gen, CURAND_RNG_PSEUDO_DEFAULT ) );
CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) );
CURAND_CALL( curandGenerateUniform( *gen, f, num ) );
m_gen = (void*)gen;
#elif defined OMP_OFFLOAD_TARGET_AMD
hipMalloc(&f, num * sizeof(float));
rocrand_generator* gen = new rocrand_generator;
ROCRAND_CALL(rocrand_create_generator(gen, ROCRAND_RNG_PSEUDO_DEFAULT));
ROCRAND_CALL(rocrand_set_seed(*gen, seed));
ROCRAND_CALL(rocrand_generate_uniform(*gen, f, num));
m_gen = (void*)gen;
#endif
#endif
#endif
}

m_rand_ptr = f;
Expand Down
2 changes: 2 additions & 0 deletions FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
*/

#ifdef USE_OMPGPU
#ifdef OMP_OFFLOAD_TARGET_NVIDIA
#include "gpuQ.h"
#include <iostream>

Expand All @@ -13,6 +14,7 @@ void gpu_assert(cudaError_t code, const char *file, const int line) {
exit(code);
}
}
#endif
#else
#include "gpuQ.cu"
#endif
Loading