From 366b0d5195263d49bad7933b0cc5ef8786014fa9 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Wed, 29 May 2024 07:49:47 -0400 Subject: [PATCH 01/25] hip for nvidia backend compiles --- FastCaloSimAnalyzer/CMakeLists.txt | 3 - .../FastCaloGpu/FastCaloGpu/HostDevDef.h | 3 + .../FastCaloGpu/src/CMakeLists.txt | 21 ++- .../FastCaloGpu/src/CaloGpuGeneral.cxx | 20 ++- .../FastCaloGpu/src/Rand4Hits_hip.cxx | 21 +++ .../Root/TFCSShapeValidation.cxx | 120 +++++++++--------- README.md | 5 + scripts/hipcc_nvidia | 36 ++++++ 8 files changed, 161 insertions(+), 68 deletions(-) create mode 100755 scripts/hipcc_nvidia diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 61665de..630f05a 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -45,14 +45,11 @@ elseif( USE_KOKKOS ) find_package(Kokkos) elseif(USE_ALPAKA) find_package(alpaka REQUIRED) -elseif(USE_HIP) - find_package(HIP REQUIRED) endif() include(FastCaloSim) include(XRootD) - set(PROJECT_SRC_DIR ${CMAKE_SOURCE_DIR}/Root) add_subdirectory(FastCaloSimCommon/src FastCaloSimCommon) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h index 2a8499a..3f45817 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h @@ -18,6 +18,9 @@ #define __HOSTDEV__ #define __INLINE__ inline #elif defined(USE_HIP) + #if defined(HIP_TARGET_NVIDIA) + #include "cuda_runtime.h" + #endif #define __DEVICE__ __device__ #define __HOST__ __host__ #define __HOSTDEV__ __host__ __device__ diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 830375e..c02b216 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -55,17 +55,27 @@ elseif(ENABLE_OMPGPU) elseif(USE_ALPAKA) set(FastCaloGpu_Srcs CaloGpuGeneral.cxx KernelWrapper_al.cxx Rand4Hits_al.cxx ) elseif(USE_HIP) - include_directories( /opt/rocm/hip/include ) + + # Define ROCM_PATH if not defined + if (NOT DEFINED ROCM_PATH) + set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.") + endif() + include_directories( ${ROCM_PATH}/hip/include ) + if ( ${HIP_TARGET} STREQUAL "AMD" ) message(STATUS " Using AMD HIP backend") add_compile_definitions(__HIP_PLATFORM_AMD__ HIP_PLATFORM_AMD) + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) message(STATUS " Using NVIDIA HIP backend") add_compile_definitions(__HIP_PLATFORM_NVIDIA__ HIP_PLATFORM_NVIDIA) + find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") else() - message(FATAL_ERROR "unknown HIP_TARGET=${HIP_TARGET}. Must be either AMD or NVIDIA") + message(FATAL_ERROR "unknown HIP_TARGET=${HIP_TARGET}. Must be either AMD or NVIDIA") endif() - add_compile_definitions(__HIP_PLATFORM_HCC__ HIP_PLATFORM_HCC) + #add_compile_definitions(__HIP_PLATFORM_HCC__ HIP_PLATFORM_HCC) set(FastCaloGpu_Srcs GeoLoadGpu.cxx DEV_BigMem_hip.cxx KernelWrapper_hip.cxx gpuQ_hip.cxx Rand4Hits_hip.cxx ) @@ -101,6 +111,11 @@ if(USE_ALPAKA) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} alpaka::alpaka) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) elseif(USE_HIP) + if("${HIP_TARGET}" STREQUAL "NVIDIA") + target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + endif() target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) target_include_directories(${FastCaloGpu_LIB} PRIVATE ../FastCaloGpu/ } ) target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index 0ebd1d2..b27b94c 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -12,6 +12,8 @@ #include "Rand4Hits.h" #include #include +#include "gpuQ.h" +#include "hip/hip_runtime.h" void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, unsigned long long seed, bool /*hitspy*/) { @@ -70,6 +72,20 @@ void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, std::cout << "using HIP on "; #ifdef __HIP_PLATFORM_NVIDIA__ std::cout << "NVIDIA\n"; + int nDevices; + cudaGetDeviceCount(&nDevices); + for (int i = 0; i < nDevices; i++) { + cudaDeviceProp prop; + cudaGetDeviceProperties(&prop, i); + printf("Device Number: %d\n", i); + printf(" Device name: %s\n", prop.name); + printf(" Memory Clock Rate (KHz): %d\n", + prop.memoryClockRate); + printf(" Memory Bus Width (bits): %d\n", + prop.memoryBusWidth); + printf(" Peak Memory Bandwidth (GB/s): %f\n\n", + 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); + } #elif defined __HIP_PLATFORM_AMD__ std::cout << "AMD\n"; #else @@ -126,7 +142,7 @@ void CaloGpuGeneral::simulate_hits_gr(Sim_Args &args) { #elif defined USE_OMPGPU CaloGpuGeneral_omp::simulate_hits_gr(args); #else - CaloGpuGeneral_cu::simulate_hits_gr(args); + //CaloGpuGeneral_cu::simulate_hits_gr(args); #endif } @@ -147,6 +163,6 @@ void CaloGpuGeneral::load_hitsim_params(void *rd4h, HitParams *hp, #elif defined(USE_OMPGPU) CaloGpuGeneral_omp::load_hitsim_params(rd4h, hp, simbins, bins); #else - CaloGpuGeneral_cu::load_hitsim_params(rd4h, hp, simbins, bins); + //CaloGpuGeneral_cu::load_hitsim_params(rd4h, hp, simbins, bins); #endif } diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx index 5fefaa5..7bc8c2d 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx @@ -9,6 +9,10 @@ #include #include +#if defined (__HIP_PLATFORM_NVIDIA__) +#include +#endif + #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" @@ -90,8 +94,13 @@ Rand4Hits::~Rand4Hits() { if (m_useCPU) { destroyCPUGen(); } else { +#if defined (__HIP_PLATFORM_NVIDIA__) + CURAND_CALL(curandDestroyGenerator(*((curandGenerator_t *)m_gen))); + delete (curandGenerator_t *)m_gen; +#else CURAND_CALL(hiprandDestroyGenerator(*((hiprandGenerator_t *)m_gen))); delete (hiprandGenerator_t *)m_gen; +#endif } }; @@ -104,8 +113,13 @@ void Rand4Hits::rd_regen() { hipMemcpyHostToDevice)); #endif } else { +#if defined (__HIP_PLATFORM_NVIDIA__) + CURAND_CALL(curandGenerateUniform(*((curandGenerator_t *)m_gen), m_rand_ptr, + 3 * m_total_a_hits)); +#else CURAND_CALL(hiprandGenerateUniform(*((hiprandGenerator_t *)m_gen), m_rand_ptr, 3 * m_total_a_hits)); +#endif } }; @@ -128,10 +142,17 @@ void Rand4Hits::create_gen(unsigned long long seed, size_t num, bool useCPU) { #endif } else { gpuQ(hipMalloc(&f, num * sizeof(float))); +#if defined (__HIP_PLATFORM_NVIDIA__) + curandGenerator_t *gen = new curandGenerator_t; + CURAND_CALL(curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(*gen, seed)); + CURAND_CALL(curandGenerateUniform(*gen, f, num)); +#else hiprandGenerator_t *gen = new hiprandGenerator_t; CURAND_CALL(hiprandCreateGenerator(gen, HIPRAND_RNG_PSEUDO_DEFAULT)); CURAND_CALL(hiprandSetPseudoRandomGeneratorSeed(*gen, seed)); CURAND_CALL(hiprandGenerateUniform(*gen, f, num)); +#endif m_gen = (void *)gen; } diff --git a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx index f506ebf..602ccee 100644 --- a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx +++ b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx @@ -431,68 +431,68 @@ void TFCSShapeValidation::LoopEvents( int pcabin = -1 ) { } // else std::cout<<"Skipping GPU for E/P: " << ievent<< " , "<< p << std::endl ; - bin_index = ( *( chain_simul.get_es() ) ).bin_index; - tot_hits = ( *( chain_simul.get_es() ) ).tot_hits; - n_simbins = ( *( chain_simul.get_es() ) ).n_simbins; - if ( index >= MAX_SIM || tot_hits > ( MAXHITS - 100000 ) || es.is_last ) { +// bin_index = ( *( chain_simul.get_es() ) ).bin_index; +// tot_hits = ( *( chain_simul.get_es() ) ).tot_hits; +// n_simbins = ( *( chain_simul.get_es() ) ).n_simbins; +// if ( index >= MAX_SIM || tot_hits > ( MAXHITS - 100000 ) || es.is_last ) { // here need to do GPU simulation !!!!. - auto tg_s = std::chrono::system_clock::now(); - CaloGpuGeneral::load_hitsim_params( m_rd4h, &( hitparams[0] ), &( simbins[0] ), n_simbins ); - auto tg_s_A = std::chrono::system_clock::now(); - t_g_sim_A += tg_s_A - tg_s; - - Sim_Args args; - args.debug = m_debug; - args.rd4h = m_rd4h; - args.geo = GeoLoadGpu::Geo_g; - args.cells_energy = nullptr; - args.hitcells_E = nullptr; - args.hitcells_E_h = nullptr; - args.ct = nullptr; - args.ct_h = nullptr; - args.hitparams = nullptr; - args.hitparams_h = hitparams; - args.simbins = nullptr; - args.nbins = n_simbins; - args.nsims = index; - args.nhits = tot_hits; - args.ncells = GeoLoadGpu::num_cells; - - - - - CaloGpuGeneral::simulate_hits_gr( args ); - auto tg_s_B = std::chrono::system_clock::now(); - t_g_sim_B += tg_s_B - tg_s; - - - for ( int isim = 0; isim < index; isim++ ) { - TFCSSimulationState& sim = m_validations[g_sims_v[isim]].simul()[g_sims_st[isim]]; - // std::cout << "gpucellCT["<200000 || args.hitcells_E_h[ii+isim*MAXHITCT].cellid <=0 - // ) std::cout << "Something Wrong cellid: " << args.hitcells_E_h[ii+isim*MAXHITCT].cellid <<", - // isim="<index2cell( args.hitcells_E_h[ii + isim * MAXHITCT].cellid ); - // std::cout << ",Is" << isim <<"Ic"<index2cell( args.hitcells_E_h[ii + isim * MAXHITCT].cellid ); +// // std::cout << ",Is" << isim <<"Ic"< Date: Thu, 30 May 2024 10:27:03 -0700 Subject: [PATCH 02/25] perlmutter --- .../FastCaloGpu/src/CMakeLists.txt | 6 +- .../FastCaloGpu/src/Rand4Hits_hip.cxx | 20 ++- .../Root/TFCSShapeValidation.cxx | 118 +++++++++--------- README.md | 12 ++ 4 files changed, 88 insertions(+), 68 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index c02b216..2b968f2 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -118,10 +118,10 @@ elseif(USE_HIP) endif() target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) target_include_directories(${FastCaloGpu_LIB} PRIVATE ../FastCaloGpu/ } ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) + #target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) + #target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) - target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) + #target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx index 7bc8c2d..f68efa5 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx @@ -7,7 +7,7 @@ #include "DEV_BigMem.h" #include -#include +//#include #if defined (__HIP_PLATFORM_NVIDIA__) #include @@ -16,11 +16,19 @@ #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" -#define CURAND_CALL(x) \ - if ((x) != HIPRAND_STATUS_SUCCESS) { \ - printf("Error at %s:%d\n", __FILE__, __LINE__); \ - exit(EXIT_FAILURE); \ - } +#if defined (__HIP_PLATFORM_NVIDIA__) + #define CURAND_CALL(x) \ + if ((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } +#else + #define CURAND_CALL(x) \ + if ((x) != HIPRAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif #ifndef USE_STDPAR void Rand4Hits::allocate_simulation(int maxbins, int maxhitct, diff --git a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx index 602ccee..4cfa8c6 100644 --- a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx +++ b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx @@ -431,68 +431,68 @@ void TFCSShapeValidation::LoopEvents( int pcabin = -1 ) { } // else std::cout<<"Skipping GPU for E/P: " << ievent<< " , "<< p << std::endl ; -// bin_index = ( *( chain_simul.get_es() ) ).bin_index; -// tot_hits = ( *( chain_simul.get_es() ) ).tot_hits; -// n_simbins = ( *( chain_simul.get_es() ) ).n_simbins; -// if ( index >= MAX_SIM || tot_hits > ( MAXHITS - 100000 ) || es.is_last ) { + bin_index = ( *( chain_simul.get_es() ) ).bin_index; + tot_hits = ( *( chain_simul.get_es() ) ).tot_hits; + n_simbins = ( *( chain_simul.get_es() ) ).n_simbins; + if ( index >= MAX_SIM || tot_hits > ( MAXHITS - 100000 ) || es.is_last ) { // here need to do GPU simulation !!!!. -// auto tg_s = std::chrono::system_clock::now(); -// CaloGpuGeneral::load_hitsim_params( m_rd4h, &( hitparams[0] ), &( simbins[0] ), n_simbins ); -// auto tg_s_A = std::chrono::system_clock::now(); -// t_g_sim_A += tg_s_A - tg_s; -// -// Sim_Args args; -// args.debug = m_debug; -// args.rd4h = m_rd4h; -// args.geo = GeoLoadGpu::Geo_g; -// args.cells_energy = nullptr; -// args.hitcells_E = nullptr; -// args.hitcells_E_h = nullptr; -// args.ct = nullptr; -// args.ct_h = nullptr; -// args.hitparams = nullptr; -// args.hitparams_h = hitparams; -// args.simbins = nullptr; -// args.nbins = n_simbins; -// args.nsims = index; -// args.nhits = tot_hits; -// args.ncells = GeoLoadGpu::num_cells; -// -// -// -// -// CaloGpuGeneral::simulate_hits_gr( args ); -// auto tg_s_B = std::chrono::system_clock::now(); -// t_g_sim_B += tg_s_B - tg_s; -// -// -// for ( int isim = 0; isim < index; isim++ ) { -// TFCSSimulationState& sim = m_validations[g_sims_v[isim]].simul()[g_sims_st[isim]]; -// // std::cout << "gpucellCT["<200000 || args.hitcells_E_h[ii+isim*MAXHITCT].cellid <=0 -// // ) std::cout << "Something Wrong cellid: " << args.hitcells_E_h[ii+isim*MAXHITCT].cellid <<", -// // isim="<index2cell( args.hitcells_E_h[ii + isim * MAXHITCT].cellid ); -// // std::cout << ",Is" << isim <<"Ic"<index2cell( args.hitcells_E_h[ii + isim * MAXHITCT].cellid ); + // std::cout << ",Is" << isim <<"Ic"< Date: Thu, 30 May 2024 10:41:20 -0700 Subject: [PATCH 03/25] uncommented Rand4Hits_finish --- FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx index 4cfa8c6..f506ebf 100644 --- a/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx +++ b/FastCaloSimAnalyzer/Root/TFCSShapeValidation.cxx @@ -503,7 +503,7 @@ void TFCSShapeValidation::LoopEvents( int pcabin = -1 ) { } // end loop over events // auto t_04 = std::chrono::system_clock::now(); #ifdef USE_GPU -// if ( m_rd4h ) CaloGpuGeneral::Rand4Hits_finish( m_rd4h ); + if ( m_rd4h ) CaloGpuGeneral::Rand4Hits_finish( m_rd4h ); #endif auto t3 = std::chrono::system_clock::now(); From 396c780ce7cc8ba363776132a9e63992826a0f31 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Thu, 30 May 2024 11:00:58 -0700 Subject: [PATCH 04/25] uncommented load_hit_sim and simulate --- FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index b27b94c..75ffec2 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -142,7 +142,7 @@ void CaloGpuGeneral::simulate_hits_gr(Sim_Args &args) { #elif defined USE_OMPGPU CaloGpuGeneral_omp::simulate_hits_gr(args); #else - //CaloGpuGeneral_cu::simulate_hits_gr(args); + CaloGpuGeneral_cu::simulate_hits_gr(args); #endif } @@ -163,6 +163,6 @@ void CaloGpuGeneral::load_hitsim_params(void *rd4h, HitParams *hp, #elif defined(USE_OMPGPU) CaloGpuGeneral_omp::load_hitsim_params(rd4h, hp, simbins, bins); #else - //CaloGpuGeneral_cu::load_hitsim_params(rd4h, hp, simbins, bins); + CaloGpuGeneral_cu::load_hitsim_params(rd4h, hp, simbins, bins); #endif } From 8cb14b2f6c3697adf47042c28886aec744a8bfe3 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Tue, 11 Jun 2024 08:55:33 -0700 Subject: [PATCH 05/25] conditional compilation for RNDCPU_GEN --- .../FastCaloGpu/FastCaloGpu/HostDevDef.h | 3 ++ .../FastCaloGpu/src/CMakeLists.txt | 31 +++++++++++-------- .../FastCaloGpu/src/CaloGpuGeneral.cxx | 4 +-- .../FastCaloGpu/src/Rand4Hits_hip.cxx | 21 +++++++++---- README.md | 11 +++++++ 5 files changed, 49 insertions(+), 21 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h index 3f45817..d84e7f1 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h @@ -21,6 +21,9 @@ #if defined(HIP_TARGET_NVIDIA) #include "cuda_runtime.h" #endif + #if defined(HIP_TARGET_AMD) + #include "hip_runtime.h" + #endif #define __DEVICE__ __device__ #define __HOST__ __host__ #define __HOSTDEV__ __host__ __device__ diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 2b968f2..6b9eff7 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -64,18 +64,15 @@ elseif(USE_HIP) if ( ${HIP_TARGET} STREQUAL "AMD" ) message(STATUS " Using AMD HIP backend") - add_compile_definitions(__HIP_PLATFORM_AMD__ HIP_PLATFORM_AMD) set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) message(STATUS " Using NVIDIA HIP backend") - add_compile_definitions(__HIP_PLATFORM_NVIDIA__ HIP_PLATFORM_NVIDIA) find_package(CUDAToolkit REQUIRED) set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86") set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") else() - message(FATAL_ERROR "unknown HIP_TARGET=${HIP_TARGET}. Must be either AMD or NVIDIA") + message(FATAL_ERROR "unknown HIP_TARGET=${HIP_TARGET}. Must be either AMD or NVIDIA") endif() - #add_compile_definitions(__HIP_PLATFORM_HCC__ HIP_PLATFORM_HCC) set(FastCaloGpu_Srcs GeoLoadGpu.cxx DEV_BigMem_hip.cxx KernelWrapper_hip.cxx gpuQ_hip.cxx Rand4Hits_hip.cxx ) @@ -111,17 +108,16 @@ if(USE_ALPAKA) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} alpaka::alpaka) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) elseif(USE_HIP) - if("${HIP_TARGET}" STREQUAL "NVIDIA") - target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart) - target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA) - target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) - endif() target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) target_include_directories(${FastCaloGpu_LIB} PRIVATE ../FastCaloGpu/ } ) - #target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) - #target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) - - #target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) + + if ( ${HIP_TARGET} STREQUAL "AMD" ) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_AMD) + elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart) + endif() + else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() @@ -152,6 +148,15 @@ endif() if(RNDGEN_CPU) message(STATUS "Will generate random numbers on CPU") target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU ) + # TODO Link a portable RNG library +else() + if ( ${HIP_TARGET} STREQUAL "AMD" ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) + elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + endif() endif() if(DUMP_HITCELLS) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index 75ffec2..ee1ca87 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -70,7 +70,7 @@ void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, std::cout << "using OpenMP GPU\n"; #elif defined(USE_HIP) std::cout << "using HIP on "; - #ifdef __HIP_PLATFORM_NVIDIA__ + #ifdef HIP_TARGET_NVIDIA std::cout << "NVIDIA\n"; int nDevices; cudaGetDeviceCount(&nDevices); @@ -86,7 +86,7 @@ void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, printf(" Peak Memory Bandwidth (GB/s): %f\n\n", 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); } - #elif defined __HIP_PLATFORM_AMD__ + #elif defined HIP_TARGET_AMD std::cout << "AMD\n"; #else std::cout << "UNKNOWN\n"; diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx index f68efa5..d9f1111 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx @@ -7,16 +7,19 @@ #include "DEV_BigMem.h" #include -//#include -#if defined (__HIP_PLATFORM_NVIDIA__) +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) #include +#else +#include +#endif #endif #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" -#if defined (__HIP_PLATFORM_NVIDIA__) +#if defined (HIP_TARGET_NVIDIA) #define CURAND_CALL(x) \ if ((x) != CURAND_STATUS_SUCCESS) { \ printf("Error at %s:%d\n", __FILE__, __LINE__); \ @@ -102,12 +105,14 @@ Rand4Hits::~Rand4Hits() { if (m_useCPU) { destroyCPUGen(); } else { -#if defined (__HIP_PLATFORM_NVIDIA__) +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) CURAND_CALL(curandDestroyGenerator(*((curandGenerator_t *)m_gen))); delete (curandGenerator_t *)m_gen; #else CURAND_CALL(hiprandDestroyGenerator(*((hiprandGenerator_t *)m_gen))); delete (hiprandGenerator_t *)m_gen; +#endif #endif } }; @@ -121,12 +126,14 @@ void Rand4Hits::rd_regen() { hipMemcpyHostToDevice)); #endif } else { -#if defined (__HIP_PLATFORM_NVIDIA__) +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) CURAND_CALL(curandGenerateUniform(*((curandGenerator_t *)m_gen), m_rand_ptr, 3 * m_total_a_hits)); #else CURAND_CALL(hiprandGenerateUniform(*((hiprandGenerator_t *)m_gen), m_rand_ptr, 3 * m_total_a_hits)); +#endif #endif } }; @@ -149,8 +156,9 @@ void Rand4Hits::create_gen(unsigned long long seed, size_t num, bool useCPU) { hipMemcpyHostToDevice)); #endif } else { +#ifndef RNDGEN_CPU gpuQ(hipMalloc(&f, num * sizeof(float))); -#if defined (__HIP_PLATFORM_NVIDIA__) +#if defined (HIP_TARGET_NVIDIA) curandGenerator_t *gen = new curandGenerator_t; CURAND_CALL(curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CALL(curandSetPseudoRandomGeneratorSeed(*gen, seed)); @@ -162,6 +170,7 @@ void Rand4Hits::create_gen(unsigned long long seed, size_t num, bool useCPU) { CURAND_CALL(hiprandGenerateUniform(*gen, f, num)); #endif m_gen = (void *)gen; +#endif } m_rand_ptr = f; diff --git a/README.md b/README.md index 7d79c7d..da5b88c 100644 --- a/README.md +++ b/README.md @@ -175,13 +175,24 @@ cmake ../src/FastCaloSimAnalyzer \ ``` ## BNL CSI lambda2 +``` export HIP_PLATFORM=nvidia export HIP_COMPILER=nvcc export HIP_RUNTIME=cuda /work/atif/packages/cmake-3.25.0-linux-x86_64/bin/cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/work/atif/FCS-GPU-orig/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA +``` ## Perlmutter +For Nvidia backend with HIP select HIP_PLATFORM=nvidia, HIP_COMPILER=nvcc, HIP_RUNTIME=cuda +``` +export HIP_PLATFORM=nvidia +export HIP_COMPILER=nvcc +export HIP_RUNTIME=cuda +module load hip +export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs +source /global/homes/a/atif/packages/root_install/bin/thisroot.sh cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" -DRNDGEN_CPU=on +``` ### alpaka From 28cfd3e77869234fb2055fbf19ae6ad5fd9c04ff Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Tue, 11 Jun 2024 12:29:38 -0400 Subject: [PATCH 06/25] added build instruction for lambda2 HIP AMD backend --- .../FastCaloGpu/src/CaloGpuGeneral.cxx | 14 -------------- README.md | 14 ++++++++------ 2 files changed, 8 insertions(+), 20 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index ee1ca87..c737730 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -72,20 +72,6 @@ void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, std::cout << "using HIP on "; #ifdef HIP_TARGET_NVIDIA std::cout << "NVIDIA\n"; - int nDevices; - cudaGetDeviceCount(&nDevices); - for (int i = 0; i < nDevices; i++) { - cudaDeviceProp prop; - cudaGetDeviceProperties(&prop, i); - printf("Device Number: %d\n", i); - printf(" Device name: %s\n", prop.name); - printf(" Memory Clock Rate (KHz): %d\n", - prop.memoryClockRate); - printf(" Memory Bus Width (bits): %d\n", - prop.memoryBusWidth); - printf(" Peak Memory Bandwidth (GB/s): %f\n\n", - 2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6); - } #elif defined HIP_TARGET_AMD std::cout << "AMD\n"; #else diff --git a/README.md b/README.md index da5b88c..8fb5e24 100644 --- a/README.md +++ b/README.md @@ -174,16 +174,18 @@ cmake ../src/FastCaloSimAnalyzer \ -DENABLE_GPU=on -DUSE_HIP ``` -## BNL CSI lambda2 +## BNL CSI lambda2: HIP for AMD ``` -export HIP_PLATFORM=nvidia -export HIP_COMPILER=nvcc -export HIP_RUNTIME=cuda -/work/atif/packages/cmake-3.25.0-linux-x86_64/bin/cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/work/atif/FCS-GPU-orig/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA +module use /work/software/modulefiles +module load rocm/4.5.0 +source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh +export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ +/work/atif/packages/cmake-3.25.0-linux-x86_64/bin/cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=AMD -DCMAKE_CXX_FLAGS="-I/opt/rocm/hip/include/hip/" ``` -## Perlmutter +## Perlmutter: HIP for Nvidia For Nvidia backend with HIP select HIP_PLATFORM=nvidia, HIP_COMPILER=nvcc, HIP_RUNTIME=cuda +and use hipcc_nvidia compiler script ``` export HIP_PLATFORM=nvidia export HIP_COMPILER=nvcc From 63927bd04c137bc495f2adc7b3c51df797209bc5 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Wed, 26 Jun 2024 10:16:33 -0400 Subject: [PATCH 07/25] added checks for envs and rngs --- FastCaloSimAnalyzer/CMakeLists.txt | 6 ++++++ FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt | 10 ++++++---- README.md | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 630f05a..59526c8 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -43,6 +43,12 @@ if ( USE_STDPAR ) message (STATUS "Will target ${STDPAR_TARGET} for std::par with ${STDPAR_DIRECTIVE}") elseif( USE_KOKKOS ) find_package(Kokkos) +elseif( USE_HIP ) + if ( ${HIP_TARGET} STREQUAL "NVIDIA" ) + if ( NOT RNDGEN_CPU ) + message(FATAL_ERROR "when HIP_TARGET=NVIDIA, RNDGEN_CPU must be ON") + endif() + endif() elseif(USE_ALPAKA) find_package(alpaka REQUIRED) endif() diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 6b9eff7..8f5c07a 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -57,10 +57,12 @@ elseif(USE_ALPAKA) elseif(USE_HIP) # Define ROCM_PATH if not defined - if (NOT DEFINED ROCM_PATH) + if (NOT DEFINED ENV{ROCM_PATH}) + message(WARNING "Environment variable ROCM_PATH not set! Using default /opt/rocm/") set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.") + else() + set(ROCM_PATH $ENV{ROCM_PATH}) endif() - include_directories( ${ROCM_PATH}/hip/include ) if ( ${HIP_TARGET} STREQUAL "AMD" ) message(STATUS " Using AMD HIP backend") @@ -151,9 +153,9 @@ if(RNDGEN_CPU) # TODO Link a portable RNG library else() if ( ${HIP_TARGET} STREQUAL "AMD" ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) - target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so) elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() diff --git a/README.md b/README.md index 8fb5e24..36adcd0 100644 --- a/README.md +++ b/README.md @@ -177,7 +177,7 @@ cmake ../src/FastCaloSimAnalyzer \ ## BNL CSI lambda2: HIP for AMD ``` module use /work/software/modulefiles -module load rocm/4.5.0 +module load rocmmod4.5.0 source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ /work/atif/packages/cmake-3.25.0-linux-x86_64/bin/cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=AMD -DCMAKE_CXX_FLAGS="-I/opt/rocm/hip/include/hip/" From a4591f0600cf9dd8d0901831e385d13f64b8832b Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Wed, 26 Jun 2024 10:53:36 -0400 Subject: [PATCH 08/25] cleaned CMakeLists --- FastCaloSimAnalyzer/CMakeLists.txt | 1 + FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 59526c8..537c80a 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -56,6 +56,7 @@ endif() include(FastCaloSim) include(XRootD) + set(PROJECT_SRC_DIR ${CMAKE_SOURCE_DIR}/Root) add_subdirectory(FastCaloSimCommon/src FastCaloSimCommon) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 8f5c07a..bfd1e55 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -63,7 +63,8 @@ elseif(USE_HIP) else() set(ROCM_PATH $ENV{ROCM_PATH}) endif() - + include_directories( ${ROCM_PATH}/hip/include ) + if ( ${HIP_TARGET} STREQUAL "AMD" ) message(STATUS " Using AMD HIP backend") set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") @@ -154,7 +155,7 @@ if(RNDGEN_CPU) else() if ( ${HIP_TARGET} STREQUAL "AMD" ) target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so) elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) From 56caafcc7de10ece45083711ea432ac51f647890 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Wed, 26 Jun 2024 11:04:32 -0400 Subject: [PATCH 09/25] cleaned CMakeLists --- FastCaloSimAnalyzer/CMakeLists.txt | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 537c80a..3a276d2 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -43,14 +43,15 @@ if ( USE_STDPAR ) message (STATUS "Will target ${STDPAR_TARGET} for std::par with ${STDPAR_DIRECTIVE}") elseif( USE_KOKKOS ) find_package(Kokkos) -elseif( USE_HIP ) +elseif(USE_ALPAKA) + find_package(alpaka REQUIRED) +elseif(USE_HIP) + find_package(HIP REQUIRED) if ( ${HIP_TARGET} STREQUAL "NVIDIA" ) if ( NOT RNDGEN_CPU ) message(FATAL_ERROR "when HIP_TARGET=NVIDIA, RNDGEN_CPU must be ON") endif() endif() -elseif(USE_ALPAKA) - find_package(alpaka REQUIRED) endif() include(FastCaloSim) From 94ba113b665bdbf23843bb00c7927ec591ebf0ec Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Sun, 30 Jun 2024 22:40:51 -0400 Subject: [PATCH 10/25] removed extra hip_runtime include --- FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx | 2 -- 1 file changed, 2 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index c737730..a5705d6 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -12,8 +12,6 @@ #include "Rand4Hits.h" #include #include -#include "gpuQ.h" -#include "hip/hip_runtime.h" void *CaloGpuGeneral::Rand4Hits_init(long long maxhits, int maxbin, unsigned long long seed, bool /*hitspy*/) { From 827c4a172030a0184613a10fb9a43f42b364938b Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Mon, 1 Jul 2024 10:13:51 -0400 Subject: [PATCH 11/25] first omp-portable-rng apis --- .../FastCaloGpu/src/CMakeLists.txt | 11 +++++++- .../FastCaloGpu/src/Rand4Hits_omp.cxx | 27 ++++++++++++++----- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index bfd1e55..49ed704 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -151,8 +151,17 @@ endif() if(RNDGEN_CPU) message(STATUS "Will generate random numbers on CPU") target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU ) +elseif(RNDGEN_OMP) # TODO Link a portable RNG library -else() + # TODO locate OMP_PORTABLE_RNG with an env var + set(OMP_RNG "/work/atif/test-benchmark-OpenMP-RNG/" CACHE STRING "Default OMP-Portable-RNG library.") + #set(OMP_RNG "/work/atif/FCS-GPU/omp-portable-rng/" CACHE STRING "Default OMP-Portable-RNG library.") + target_include_directories(${FastCaloGpu_LIB} PUBLIC ${OMP_RNG} ) + target_include_directories(${FastCaloGpu_LIB} PUBLIC ${OMP_RNG}/implementation ) + message(STATUS "Will generate OMP random numbers ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}") + target_link_libraries(${FastCaloGpu_LIB} PRIVATE ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_OMP ) +elseif(USE_HIP) if ( ${HIP_TARGET} STREQUAL "AMD" ) target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 00ea772..3c6a0ce 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -8,6 +8,7 @@ #include #include #include +#include "openmp_rng.h" #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" @@ -105,14 +106,28 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { - gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); - curandGenerator_t* gen = new curandGenerator_t; - CURAND_CALL( curandCreateGenerator( gen, CURAND_RNG_PSEUDO_DEFAULT ) ); - CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) ); - CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); - m_gen = (void*)gen; + //gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); + float* f = (float*) malloc (sizeof(float) * (num)); + // TODO: fix compilation errors with openmp_target_alloc, is_device_ptr + //curandGenerator_t* gen = new curandGenerator_t; + //CURAND_CALL( curandCreateGenerator( gen, CURAND_RNG_PSEUDO_DEFAULT ) ); + auto gen_type = generator_enum::xorwow; + std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; +//#if defined(OMP_RND_ARCH_CUDA) || defined(OMP_RNG_ARCH_HIP) +//#endif + #pragma omp target data map(tofrom:f[0:num]) + { + //CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) ); + //CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); + #pragma omp target data use_device_ptr(f) + omp_get_rng_uniform_float(f, num, seed, gen_type); + + } + m_gen = (void*)gen_type; } + //TODO this needs to be a device pointer enforcing it throws compilation errors + #pragma omp target data use_device_ptr(f) m_rand_ptr = f; std::cout << "R4H m_rand_ptr: " << m_rand_ptr << std::endl; From ef7afae42626f7fc605fd1dbfe68016b651b7212 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Mon, 1 Jul 2024 10:15:21 -0400 Subject: [PATCH 12/25] added readme --- readme_omp_rng.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 readme_omp_rng.md diff --git a/readme_omp_rng.md b/readme_omp_rng.md new file mode 100644 index 0000000..4663874 --- /dev/null +++ b/readme_omp_rng.md @@ -0,0 +1,15 @@ + +## Build Instructions for alpha/lambda @ CSI, BNL +Change OMP_RNG path in FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +according to your location of +git clone https://github.com/GKNB/test-benchmark-OpenMP-atomic.git + +``` +module use /work/software/modulefiles +module load llvm-openmp-dev +source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh +export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ +export OMP_TARGET_OFFLOAD=mandatory +cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_CUDA" +``` + From bb9982dace2362838a50fd50e52a7822eb8ff220 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Mon, 1 Jul 2024 20:47:53 -0400 Subject: [PATCH 13/25] fixed runtime errors --- .../FastCaloGpu/FastCaloGpu/Rand4Hits.h | 1 + .../FastCaloGpu/src/Rand4Hits_omp.cxx | 35 +++++++------------ 2 files changed, 13 insertions(+), 23 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/Rand4Hits.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/Rand4Hits.h index ab706ea..48c9454 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/Rand4Hits.h +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/Rand4Hits.h @@ -170,6 +170,7 @@ class Rand4Hits { unsigned int m_current_hits; void *m_gen{ nullptr }; bool m_useCPU{ false }; + unsigned long long m_seed{0}; // patch in some GPU pointers for cudaMalloc CELL_ENE_T *m_cells_energy{ 0 }; diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 3c6a0ce..713c9bb 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -72,8 +72,9 @@ Rand4Hits::~Rand4Hits() { if ( m_useCPU ) { destroyCPUGen(); } else { - CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) ); - delete (curandGenerator_t*)m_gen; + // TODO: Do we need this for Portable RNG? + // CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) ); + // delete (curandGenerator_t*)m_gen; } }; @@ -85,7 +86,9 @@ void Rand4Hits::rd_regen() { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { - CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); + auto gen = generator_enum::xorwow; + omp_get_rng_uniform_float(m_rand_ptr, 3 * m_total_a_hits, m_seed, gen); + //CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); } }; @@ -106,28 +109,14 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { - //gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); - float* f = (float*) malloc (sizeof(float) * (num)); - // TODO: fix compilation errors with openmp_target_alloc, is_device_ptr - //curandGenerator_t* gen = new curandGenerator_t; - //CURAND_CALL( curandCreateGenerator( gen, CURAND_RNG_PSEUDO_DEFAULT ) ); - auto gen_type = generator_enum::xorwow; - std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; -//#if defined(OMP_RND_ARCH_CUDA) || defined(OMP_RNG_ARCH_HIP) -//#endif - #pragma omp target data map(tofrom:f[0:num]) - { - //CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) ); - //CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); - #pragma omp target data use_device_ptr(f) - omp_get_rng_uniform_float(f, num, seed, gen_type); - - } - m_gen = (void*)gen_type; + f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device ); + auto gen = generator_enum::xorwow; + omp_get_rng_uniform_float(f, num, seed, gen); + m_gen = (void*)gen; + // We need to save the seed for rd_regen + m_seed = seed; } - //TODO this needs to be a device pointer enforcing it throws compilation errors - #pragma omp target data use_device_ptr(f) m_rand_ptr = f; std::cout << "R4H m_rand_ptr: " << m_rand_ptr << std::endl; From d07f92d30d150fcdc42a0cf062b24f2a5b0a6296 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Wed, 17 Jul 2024 21:21:56 -0700 Subject: [PATCH 14/25] added checks for hip for nv --- FastCaloSimAnalyzer/CMakeLists.txt | 4 +++- FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt | 14 ++++++++------ README.md | 6 +++++- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 3a276d2..9e96252 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -46,11 +46,13 @@ elseif( USE_KOKKOS ) elseif(USE_ALPAKA) find_package(alpaka REQUIRED) elseif(USE_HIP) - find_package(HIP REQUIRED) if ( ${HIP_TARGET} STREQUAL "NVIDIA" ) + find_package(HIP) if ( NOT RNDGEN_CPU ) message(FATAL_ERROR "when HIP_TARGET=NVIDIA, RNDGEN_CPU must be ON") endif() + else() + find_package(HIP REQUIRED) endif() endif() diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index bfd1e55..cefd0d7 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -153,12 +153,14 @@ if(RNDGEN_CPU) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU ) # TODO Link a portable RNG library else() - if ( ${HIP_TARGET} STREQUAL "AMD" ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) - target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so) - elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) - target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + if(USE_HIP) + if ( ${HIP_TARGET} STREQUAL "AMD" ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so) + elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + endif() endif() endif() diff --git a/README.md b/README.md index 36adcd0..1c508e9 100644 --- a/README.md +++ b/README.md @@ -193,7 +193,11 @@ export HIP_RUNTIME=cuda module load hip export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs source /global/homes/a/atif/packages/root_install/bin/thisroot.sh -cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" -DRNDGEN_CPU=on +cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc \ + -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia \ + -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on \ + -DHIP_TARGET=NVIDIA -DRNDGEN_CPU=on \ + -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" ``` ### alpaka From 936f4e29260ffb55c2c3d42f3cac6de3aacc295c Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Sat, 27 Jul 2024 13:43:23 -0400 Subject: [PATCH 15/25] commented gpuq for openmp amd bug --- FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx index 0a6a41c..a6cef88 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx @@ -8,7 +8,7 @@ #include "Hit.h" #include "Rand4Hits.h" -#include "gpuQ.h" +//#include "gpuQ.h" #include "Args.h" #include "DEV_BigMem.h" // #include "OMP_BigMem.h" From 4feccf6412e1d42c9c498f0ae093f5ab39a2d67e Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Sat, 27 Jul 2024 14:09:26 -0400 Subject: [PATCH 16/25] amd working state --- FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt | 8 ++++---- .../FastCaloGpu/src/CaloGpuGeneral_omp.cxx | 6 +++--- FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx | 6 +++--- readme_omp_rng.md | 4 +++- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 49ed704..24c399d 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -33,9 +33,9 @@ if(ENABLE_OMPGPU) find_package(OpenMP) if(OPENMP_FOUND) set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-cuda-mode") - set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto") - set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state") - set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " ) + set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto") + set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state") + set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " ) set(OpenMP_FLAGS "-fopenmp --offload-arch=sm_86 -lomp") ## nvidia set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") @@ -51,7 +51,7 @@ if(USE_STDPAR) elseif(USE_KOKKOS) set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_kk.cxx DEV_BigMem_kk.cxx) elseif(ENABLE_OMPGPU) - set(FastCaloGpu_Srcs KernelWrapper_omp.cxx gpuQ.cxx CaloGpuGeneral.cxx DEV_BigMem_omp.cxx ) + set(FastCaloGpu_Srcs KernelWrapper_omp.cxx CaloGpuGeneral.cxx DEV_BigMem_omp.cxx ) elseif(USE_ALPAKA) set(FastCaloGpu_Srcs CaloGpuGeneral.cxx KernelWrapper_al.cxx Rand4Hits_al.cxx ) elseif(USE_HIP) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx index 0a6a41c..6ae9090 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx @@ -8,7 +8,7 @@ #include "Hit.h" #include "Rand4Hits.h" -#include "gpuQ.h" +//#include "gpuQ.h" #include "Args.h" #include "DEV_BigMem.h" // #include "OMP_BigMem.h" @@ -16,8 +16,8 @@ #include #include -#include -#include +// #include +// #include #include #include diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 713c9bb..63c5a13 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -1,13 +1,13 @@ /* Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration */ -#include "gpuQ.h" +// #include "gpuQ.h" #include "Rand4Hits.h" #include "DEV_BigMem.h" #include -#include -#include +// #include +// #include #include "openmp_rng.h" #include "GpuParams.h" diff --git a/readme_omp_rng.md b/readme_omp_rng.md index 4663874..1816ec7 100644 --- a/readme_omp_rng.md +++ b/readme_omp_rng.md @@ -10,6 +10,8 @@ module load llvm-openmp-dev source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ export OMP_TARGET_OFFLOAD=mandatory -cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_CUDA" +cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_CUDA -I/usr/local/cuda/include" ``` +# For AMD +cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_HIP -I/opt/rocm/include -L/opt/rocm/rocrand/lib/ -lrocrand" From 3c5ac3747c6fe023a6daae18400bd22ce6d97862 Mon Sep 17 00:00:00 2001 From: FNU Mohammad Atif Date: Tue, 1 Oct 2024 21:00:51 -0700 Subject: [PATCH 17/25] added parser for openmp offload-arch --- .../FastCaloGpu/src/CMakeLists.txt | 60 +++++++++++++------ .../FastCaloGpu/src/CaloGpuGeneral_omp.cxx | 4 -- .../FastCaloGpu/src/Rand4Hits_omp.cxx | 10 +++- FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx | 2 + 4 files changed, 52 insertions(+), 24 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index cefd0d7..a555975 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -20,30 +20,28 @@ endif() if(USE_HIP) set(FIND_CUDA OFF) endif() - - + +if(ENABLE_OMPGPU) + string(STRIP ${CMAKE_CXX_FLAGS} OMP_OFFLOAD_TARGET) + string(FIND ${CMAKE_CXX_FLAGS} "gfx" OMP_OFFLOAD_TARGET_AMD) + string(FIND ${CMAKE_CXX_FLAGS} "sm_" OMP_OFFLOAD_TARGET_NVIDIA) + if(OMP_OFFLOAD_TARGET_NVIDIA GREATER 0) + message(STATUS "OMP_OFFLOAD_TARGET NVIDIA" ) + elseif(OMP_OFFLOAD_TARGET_AMD GREATER 0) + set(FIND_CUDA OFF) + message(STATUS "OMP_OFFLOAD_TARGET AMD" ) + else() + ## TODO what about when OPENMP_TARGET_OFFLOAD=disabled? + message(FATAL_ERROR "!! Please specify OpenMP offload target via -DCMAKE_CXX_FLAGS=\"--offload-arch=gfx<>|sm_<>\"") + endif() +endif() + if(FIND_CUDA) find_package(CUDA REQUIRED) enable_language( CUDA ) set(CUDA_LIBRARIES PUBLIC ${CUDA_LIBRARIES}) endif() -# Add OpenMP -if(ENABLE_OMPGPU) - find_package(OpenMP) - if(OPENMP_FOUND) - set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-cuda-mode") - set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto") - set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state") - set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " ) - set(OpenMP_FLAGS "-fopenmp --offload-arch=sm_86 -lomp") ## nvidia - set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") - else() - message(WARNING "Configuring with OpenMP GPU but OpenMP is not found!") - endif() -endif() - # Sources if(USE_STDPAR) @@ -51,6 +49,19 @@ if(USE_STDPAR) elseif(USE_KOKKOS) set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_kk.cxx DEV_BigMem_kk.cxx) elseif(ENABLE_OMPGPU) + # Add OpenMP + find_package(OpenMP) + if(OPENMP_FOUND) + set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-cuda-mode") + set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto") + set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state") + set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " ) + set(OpenMP_FLAGS "-fopenmp -lomp") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") + else() + message(WARNING "Configuring with OpenMP GPU but OpenMP is not found!") + endif() set(FastCaloGpu_Srcs KernelWrapper_omp.cxx gpuQ.cxx CaloGpuGeneral.cxx DEV_BigMem_omp.cxx ) elseif(USE_ALPAKA) set(FastCaloGpu_Srcs CaloGpuGeneral.cxx KernelWrapper_al.cxx Rand4Hits_al.cxx ) @@ -120,7 +131,14 @@ elseif(USE_HIP) target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA) target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart) endif() - +elseif(ENABLE_OMPGPU) + if(OMP_OFFLOAD_TARGET_NVIDIA GREATER 0) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DOMP_OFFLOAD_TARGET_NVIDIA) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + endif() + if(OMP_OFFLOAD_TARGET_AMD GREATER 0) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DOMP_OFFLOAD_TARGET_AMD) + endif() else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() @@ -161,6 +179,10 @@ else() elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() + elseif(ENABLE_OMPGPU) + if(OMP_OFFLOAD_TARGET_AMD GREATER 0) + message(FATAL_ERROR "when OMP_TARGET_OFFLOAD to AMD, RNDGEN_CPU must be ON") + endif() endif() endif() diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx index a6cef88..e8aa457 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_omp.cxx @@ -8,16 +8,12 @@ #include "Hit.h" #include "Rand4Hits.h" -//#include "gpuQ.h" #include "Args.h" #include "DEV_BigMem.h" -// #include "OMP_BigMem.h" #include #include #include -#include -#include #include #include diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 00ea772..a1a4b83 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -1,13 +1,15 @@ /* Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration */ -#include "gpuQ.h" #include "Rand4Hits.h" #include "DEV_BigMem.h" #include +#ifdef OMP_OFFLOAD_TARGET_NVIDIA +#include "gpuQ.h" #include #include +#endif #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" @@ -71,8 +73,10 @@ Rand4Hits::~Rand4Hits() { if ( m_useCPU ) { destroyCPUGen(); } else { +#ifdef OMP_OFFLOAD_TARGET_NVIDIA CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) ); delete (curandGenerator_t*)m_gen; +#endif } }; @@ -84,7 +88,9 @@ void Rand4Hits::rd_regen() { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { +#ifdef OMP_OFFLOAD_TARGET_NVIDIA CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); +#endif } }; @@ -105,12 +111,14 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { +#ifdef OMP_OFFLOAD_TARGET_NVIDIA gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); curandGenerator_t* gen = new curandGenerator_t; CURAND_CALL( curandCreateGenerator( gen, CURAND_RNG_PSEUDO_DEFAULT ) ); CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) ); CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); m_gen = (void*)gen; +#endif } m_rand_ptr = f; diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx index 526f532..eb99660 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/gpuQ.cxx @@ -3,6 +3,7 @@ */ #ifdef USE_OMPGPU +#ifdef OMP_OFFLOAD_TARGET_NVIDIA #include "gpuQ.h" #include @@ -13,6 +14,7 @@ void gpu_assert(cudaError_t code, const char *file, const int line) { exit(code); } } +#endif #else #include "gpuQ.cu" #endif From 3d34164d7dec828d6654ffdf16cbe3a06be0022f Mon Sep 17 00:00:00 2001 From: FNU Mohammad Atif Date: Wed, 2 Oct 2024 07:23:03 -0700 Subject: [PATCH 18/25] added openmp multicore cpu --- .../FastCaloGpu/src/CMakeLists.txt | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index a555975..fe3eec5 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -31,8 +31,11 @@ if(ENABLE_OMPGPU) set(FIND_CUDA OFF) message(STATUS "OMP_OFFLOAD_TARGET AMD" ) else() - ## TODO what about when OPENMP_TARGET_OFFLOAD=disabled? - message(FATAL_ERROR "!! Please specify OpenMP offload target via -DCMAKE_CXX_FLAGS=\"--offload-arch=gfx<>|sm_<>\"") + if($ENV{OMP_TARGET_OFFLOAD} MATCHES "disabled") + set(FIND_CUDA OFF) + else() + message(FATAL_ERROR "!! Please specify OpenMP offload target via -DCMAKE_CXX_FLAGS=\"--offload-arch=gfx<>|sm_<>\" or set environment var OMP_TARGET_OFFLOAD=disabled") + endif() endif() endif() @@ -56,7 +59,7 @@ elseif(ENABLE_OMPGPU) set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -foffload-lto") set(OpenMP_OPT_FLAGS "${OpenMP_OPT_FLAGS} -fopenmp-assume-no-thread-state") set(OpenMP_OPT_RMRKS "-Rpass=openmp-opt -Rpass-analysis=openmp-opt -Rpass-missed=openmp-opt " ) - set(OpenMP_FLAGS "-fopenmp -lomp") + set(OpenMP_FLAGS "-fopenmp -lomp -lomptarget") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_FLAGS} ${OpenMP_OPT_FLAGS} ${OpenMP_OPT_RMRKS}") else() @@ -77,10 +80,10 @@ elseif(USE_HIP) include_directories( ${ROCM_PATH}/hip/include ) if ( ${HIP_TARGET} STREQUAL "AMD" ) - message(STATUS " Using AMD HIP backend") + message(STATUS "Using AMD HIP backend") set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) - message(STATUS " Using NVIDIA HIP backend") + message(STATUS "Using NVIDIA HIP backend") find_package(CUDAToolkit REQUIRED) set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86") set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") @@ -181,7 +184,10 @@ else() endif() elseif(ENABLE_OMPGPU) if(OMP_OFFLOAD_TARGET_AMD GREATER 0) - message(FATAL_ERROR "when OMP_TARGET_OFFLOAD to AMD, RNDGEN_CPU must be ON") + message(FATAL_ERROR "when OMP TARGET OFFLOAD to AMD, RNDGEN_CPU must be ON") + endif() + if(ENV{OMP_TARGET_OFFLOAD} MATCHES "disabled") + message(FATAL_ERROR "when OMP_TARGET_OFFLOAD disabled, RNDGEN_CPU must be ON") endif() endif() endif() From 021976637d2f5cd97b1bd061ccdc80f3221904b0 Mon Sep 17 00:00:00 2001 From: FNU Mohammad Atif Date: Wed, 2 Oct 2024 10:30:17 -0700 Subject: [PATCH 19/25] added script, updated readme --- README.md | 5 +- scripts/script_build_gr_all.sh | 313 +++++++++++++++++++++++++++++++++ 2 files changed, 316 insertions(+), 2 deletions(-) create mode 100644 scripts/script_build_gr_all.sh diff --git a/README.md b/README.md index 1c508e9..f3669b8 100644 --- a/README.md +++ b/README.md @@ -214,7 +214,8 @@ export OMP_TARGET_OFFLOAD=mandatory cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on \ -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 \ -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so \ - -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc + -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc \ + -DCMAKE_CXX_FLAGS="--offload-arch=sm_70" ``` ## Build Instructions for Perlmutter @@ -223,7 +224,7 @@ module load clang-16.0.6-omp-nvptx module load cudatoolkit source /global/homes/a/atif/packages/root_install/bin/thisroot.sh export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs -cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=off -DENABLE_OMPGPU=on -DCMAKE_CXX_COMPILER=clang++ -DINPUT_PATH="../../FastCaloSimInputs" -DCMAKE_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/ +cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=off -DENABLE_OMPGPU=on -DCMAKE_CXX_COMPILER=clang++ -DINPUT_PATH="../../FastCaloSimInputs" -DCMAKE_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/ -DCMAKE_CXX_FLAGS="--offload-arch=sm_80" ## Build Instructions for Cori diff --git a/scripts/script_build_gr_all.sh b/scripts/script_build_gr_all.sh new file mode 100644 index 0000000..00bd293 --- /dev/null +++ b/scripts/script_build_gr_all.sh @@ -0,0 +1,313 @@ +# CPU Exalearn5 + +# CUDA ----------------- +## Nvidia -------------- +### CURAND Exalearn5 +### CPURNG Exalearn5 + +# OpenMP --------------- +## Nvidia -------------- +### CURAND Exalearn5 +### CPURNG Exalearn5 +### OMPRNG +## AMD ----------------- +### HIPRAND xxxxxxxxx +### CPURNG Exalearn4 +### OMPRNG +## Multicore CPU -------- +### CPURNG Exalearn4 + +# HIP ------------------ +## Nvidia -------------- +### CURAND xxxxxxxxx +### CPURNG Perlmutter +## AMD ----------------- +### HIPRAND Exalearn4 +### CPURNG Exalearn4 + +# STDPAR --------------- +## Nvidia -------------- +### CURAND Exalearn5 +### CPURNG Exalearn5 +## Multicore ----------- +### CPURNG Exalearn5 +## CPU ----------------- +### CPURNG Exalearn5 + +# Alpaka --------------- +## Nvidia CUDA --------- +### CURAND Exalearn5 +### CPURNG Exalearn5 +## AMD HIP ------------- +### HIPRAND +### CPURNG + +# Kokkos --------------- +## Nvidia -------------- +### CURAND Exalearn5 +### CPURNG Exalearn5 + +# Edit this to exalearn4 or 5 accordingly +system="exalearn5" + +rm -rf build-exalearn4-* +rm -rf build-exalearn5-* + +if [ "$system" = "exalearn4" ]; then + source /global/home/users/fmohammad/packages/root-clang15/bin/thisroot.sh + export FCS_DATAPATH=/global/home/users/cgleggett/data/FastCaloSimInputs + module use /global/home/users/fmohammad/modulefiles/ + module load clang-18.0.0-gcc-8.5.0-omp-amdgcn +fi + +if [ "$system" = "exalearn5" ]; then + source /global/home/users/fmohammad/packages/root-clang15/bin/thisroot.sh + export FCS_DATAPATH=/global/home/users/cgleggett/data/FastCaloSimInputs + module use /global/home/users/fmohammad/modulefiles/ + #module load clang-15.0.6-gcc-8.5.0-omp-nvptx + module load clang-17.0.0-gcc-8.5.0-omp-nvptx +fi + +# # # # # # # # # # # # # # + +# CPU +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x CPU BUILD x-x-x-x-x" + mkdir -p build-exalearn5-cpu + cd build-exalearn5-cpu + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=off -DCMAKE_CXX_STANDARD=17 + make -j16 + echo "x-x-x-x-x CPU BUILD DONE! x-x-x-x-x" + cd .. +fi + +# # # # # # # # # # # # # # + +# CUDA +## Nvidia +### CURAND +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x CUDA CURAND BUILD x-x-x-x-x" + module load cuda/11.5 + mkdir -p build-exalearn5-cuda-curand + cd build-exalearn5-cuda-curand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DRNDGEN_CPU=Off -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CUDA_ARCHITECTURES=80 + make -j16 + echo "x-x-x-x-x CUDA CURAND BUILD DONE! x-x-x-x-x" + cd .. +fi +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x CUDA CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn5-cuda-cpurng + cd build-exalearn5-cuda-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DRNDGEN_CPU=On -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CUDA_ARCHITECTURES=80 + make -j16 + echo "x-x-x-x-x CUDA CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + +# # # # # # # # # # # # # # + +# OpenMP +## Nvidia +### CURAND +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x OpenMP Nvidia CURAND BUILD x-x-x-x-x" + export OMP_TARGET_OFFLOAD=mandatory + mkdir -p build-exalearn5-openmp-nv-curand + cd build-exalearn5-openmp-nv-curand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_CPU=Off -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_FLAGS="--offload-arch=sm_80" + make -j16 + echo "x-x-x-x-x OpenMP Nvidia CURAND BUILD DONE! x-x-x-x-x" + cd .. +fi +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x OpenMP Nvidia CPURNG BUILD x-x-x-x-x" + export OMP_TARGET_OFFLOAD=mandatory + mkdir -p build-exalearn5-openmp-nv-cpurng + cd build-exalearn5-openmp-nv-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_CPU=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_FLAGS="--offload-arch=sm_80" + make -j16 + echo "x-x-x-x-x OpenMP Nvidia CPURNG BUILD DONE x-x-x-x-x" + cd .. +fi + +# OpenMP +## AMD +### HIPRAND +# Port does not exist +### CPURNG +if [ "$system" = "exalearn4" ]; then + echo "x-x-x-x-x OpenMP AMD CPURNG BUILD x-x-x-x-x" + export OMP_TARGET_OFFLOAD=mandatory + mkdir -p build-exalearn4-openmp-amd-cpurng + cd build-exalearn4-openmp-amd-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_CPU=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_FLAGS="--offload-arch=gfx908" + make -j32 + echo "x-x-x-x-x OpenMP AMD CPURNG BUILD DONE x-x-x-x-x" + cd .. +fi + +## Multicore CPU +### CPURNG +if [ "$system" = "exalearn4" ]; then + echo "x-x-x-x-x OpenMP MULTICORE CPU CPURNG BUILD x-x-x-x-x" + export OMP_TARGET_OFFLOAD=disabled + mkdir -p build-exalearn4-openmp-multicorecpu-cpurng + cd build-exalearn4-openmp-multicorecpu-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_CPU=On -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_FLAGS="--offload-arch=gfx908" + make -j32 + echo "x-x-x-x-x OpenMP MULTICORE CPU CPURNG BUILD DONE x-x-x-x-x" + cd .. +fi + +# # # # # # # # # # # # # # + +# HIP +## Nvidia +### CURAND +### CPURNG + +## HIP +## AMD +### HIPRAND +if [ "$system" = "exalearn4" ]; then + echo "x-x-x-x-x HIP HIPRAND BUILD x-x-x-x-x" + mkdir -p build-exalearn4-hip-amd-hiprand + cd build-exalearn4-hip-amd-hiprand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=AMD -DRNDGEN_CPU=Off + make -j32 + echo "x-x-x-x-x HIP HIPRAND BUILD DONE! x-x-x-x-x" + cd .. +fi +### CPURNG +if [ "$system" = "exalearn4" ]; then + echo "x-x-x-x-x HIP CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn4-hip-amd-cpurng + cd build-exalearn4-hip-amd-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=AMD -DRNDGEN_CPU=On + make -j32 + echo "x-x-x-x-x HIP CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + +# # # # # # # # # # # # # # + +# STDPAR +## Nvidia +### CURAND +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x std::par Nvidia CURAND BUILD x-x-x-x-x" + module purge + root/6.24.06-gcc85-c17 + module load nvhpc/22.9 + module load cuda/11.5 + mkdir -p build-exalearn5-stdpar-nv-curand + cd build-exalearn5-stdpar-nv-curand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=/global/home/users/fmohammad/FCS-GPU//scripts/nvc++_p -DENABLE_GPU=on -DUSE_STDPAR=ON -DSTDPAR_TARGET=gpu -DCMAKE_CUDA_ARCHITECTURES=80 -DRNDGEN_CPU=Off + make -j16 + echo "x-x-x-x-x std::par Nvidia CURAND BUILD DONE x-x-x-x-x" + cd .. +fi + +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x std::par Nvidia CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn5-stdpar-nv-cpurng + cd build-exalearn5-stdpar-nv-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=/global/home/users/fmohammad/FCS-GPU//scripts/nvc++_p -DENABLE_GPU=on -DUSE_STDPAR=ON -DSTDPAR_TARGET=gpu -DCMAKE_CUDA_ARCHITECTURES=80 -DRNDGEN_CPU=On + make -j16 + echo "x-x-x-x-x std::par Nvidia CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + +## Multicore +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x std::par Multicore CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn5-stdpar-multicore + cd build-exalearn5-stdpar-multicore + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=/global/home/users/fmohammad/FCS-GPU//scripts/nvc++_p -DENABLE_GPU=on -DUSE_STDPAR=ON -DSTDPAR_TARGET=multicore -DCMAKE_CUDA_ARCHITECTURES=80 -DRNDGEN_CPU=On + make -j16 + echo "x-x-x-x-x std::par Multicore CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + +## CPU +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x std::par CPU CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn5-stdpar-cpu + cd build-exalearn5-stdpar-cpu + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=/global/home/users/fmohammad/FCS-GPU//scripts/nvc++_p -DENABLE_GPU=on -DUSE_STDPAR=ON -DSTDPAR_TARGET=cpu -DCMAKE_CUDA_ARCHITECTURES=80 -DRNDGEN_CPU=On + make -j16 + echo "x-x-x-x-x std::par CPU CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + +# # # # # # # # # # # # # # + +# Alpaka +## Nvidia CUDA +### CURAND +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x Alpaka Nvidia CURAND BUILD x-x-x-x-x" + module purge + root/6.24.06-gcc85-c17 + module load alpaka/0.9.0 + module load cuda/11.5 + mkdir -p build-exalearn5-alpaka-nv-curand + cd build-exalearn5-alpaka-nv-curand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DCMAKE_CXX_STANDARD=17 -DUSE_ALPAKA=on -Dalpaka_ROOT=/opt/alpaka/0.9.0/ -Dalpaka_ACC_GPU_CUDA_ENABLE=ON -Dalpaka_ACC_GPU_CUDA_ONLY_MODE=ON -DRNDGEN_CPU=Off -DCMAKE_CUDA_ARCHITECTURES=80 + make -j16 + echo "x-x-x-x-x Alpaka Nvidia CURAND BUILD DONE! x-x-x-x-x" + cd .. +fi + +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x Alpaka Nvidia CURAND BUILD x-x-x-x-x" + mkdir -p build-exalearn5-alpaka-nv-cpurng + cd build-exalearn5-alpaka-nv-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DCMAKE_CXX_STANDARD=17 -DUSE_ALPAKA=on -Dalpaka_ROOT=/opt/alpaka/0.9.0/ -Dalpaka_ACC_GPU_CUDA_ENABLE=ON -Dalpaka_ACC_GPU_CUDA_ONLY_MODE=ON -DRNDGEN_CPU=On -DCMAKE_CUDA_ARCHITECTURES=80 + make -j16 + echo "x-x-x-x-x Alpaka Nvidia CURAND BUILD DONE! x-x-x-x-x" + cd .. +fi + +## AMD HIP +### HIPRAND +### CPURNG + + +# # # # # # # # # # # # # # + +# Kokkos +## Nvidia +### CURAND +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x Kokkos Nvidia CURAND BUILD x-x-x-x-x" + module purge + root/6.24.06-gcc85-c17 + module load kokkos/4.1-cuda11.5-shlib + mkdir -p build-exalearn5-kokkos-nv-curand + cd build-exalearn5-kokkos-nv-curand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=nvcc_wrapper -DENABLE_GPU=on -DUSE_KOKKOS=ON -DRNDGEN_CPU=Off + make -j16 + echo "x-x-x-x-x Kokkos Nvidia CURAND BUILD DONE! x-x-x-x-x" + cd .. +fi +### CPURNG +if [ "$system" = "exalearn5" ]; then + echo "x-x-x-x-x Kokkos Nvidia CPURNG BUILD x-x-x-x-x" + mkdir -p build-exalearn5-kokkos-nv-cpurng + cd build-exalearn5-kokkos-nv-cpurng + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=Off -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DCMAKE_CXX_COMPILER=nvcc_wrapper -DENABLE_GPU=on -DUSE_KOKKOS=ON -DRNDGEN_CPU=On + make -j16 + echo "x-x-x-x-x Kokkos Nvidia CPURNG BUILD DONE! x-x-x-x-x" + cd .. +fi + + From 82a1fceadf3acd17b088054e3aa85dd312dd1638 Mon Sep 17 00:00:00 2001 From: Mohammad Atif Date: Thu, 10 Oct 2024 14:36:52 -0400 Subject: [PATCH 20/25] fixed random123 issues --- .../FastCaloGpu/src/Rand4Hits_omp.cxx | 27 ++++++++++++++++--- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 63c5a13..9ce708a 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -1,13 +1,10 @@ /* Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration */ -// #include "gpuQ.h" #include "Rand4Hits.h" #include "DEV_BigMem.h" #include -// #include -// #include #include "openmp_rng.h" #include "GpuParams.h" @@ -86,8 +83,18 @@ void Rand4Hits::rd_regen() { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { - auto gen = generator_enum::xorwow; + auto gen = generator_enum::xorwow; +#ifdef USE_RANDOM123 + float* f_r123 = (float*) malloc ( 3 * m_total_a_hits * sizeof( float ) ); + omp_get_rng_uniform_float(f_r123, 3 * m_total_a_hits, m_seed, gen); + if ( omp_target_memcpy( m_rand_ptr, f_r123, 3 * m_total_a_hits * sizeof( float ), m_offset, m_offset, m_select_device, + m_initial_device ) ) { + std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; + } + free(f_r123); +#else omp_get_rng_uniform_float(m_rand_ptr, 3 * m_total_a_hits, m_seed, gen); +#endif //CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); } }; @@ -109,9 +116,21 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { +#ifdef USE_RANDOM123 + f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device ); + float* f_r123 = (float*) malloc ( num * sizeof( float ) ); + auto gen = generator_enum::xorwow; + omp_get_rng_uniform_float(f_r123, num, seed, gen); + if ( omp_target_memcpy( f, f_r123, num * sizeof( float ), m_offset, m_offset, m_select_device, + m_initial_device ) ) { + std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; + } + free(f_r123); +#else f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device ); auto gen = generator_enum::xorwow; omp_get_rng_uniform_float(f, num, seed, gen); +#endif m_gen = (void*)gen; // We need to save the seed for rd_regen m_seed = seed; From a47ae2752b0cbb2082beedb4b501a1b52de3dac4 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Thu, 10 Oct 2024 14:56:52 -0700 Subject: [PATCH 21/25] fixed remaining conflicts --- FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx | 2 +- README.md | 8 -------- 2 files changed, 1 insertion(+), 9 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 0607c04..9463432 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -88,9 +88,9 @@ void Rand4Hits::rd_regen() { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { - auto gen = generator_enum::xorwow; #ifdef RNDGEN_OMP #ifdef USE_RANDOM123 + auto gen = generator_enum::xorwow; float* f_r123 = (float*) malloc ( 3 * m_total_a_hits * sizeof( float ) ); omp_get_rng_uniform_float(f_r123, 3 * m_total_a_hits, m_seed, gen); if ( omp_target_memcpy( m_rand_ptr, f_r123, 3 * m_total_a_hits * sizeof( float ), m_offset, m_offset, m_select_device, diff --git a/README.md b/README.md index 3841904..f3669b8 100644 --- a/README.md +++ b/README.md @@ -193,15 +193,11 @@ export HIP_RUNTIME=cuda module load hip export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs source /global/homes/a/atif/packages/root_install/bin/thisroot.sh -<<<<<<< HEAD -cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" -DRNDGEN_CPU=on -======= cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc \ -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia \ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on \ -DHIP_TARGET=NVIDIA -DRNDGEN_CPU=on \ -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" ->>>>>>> group_sim_combined ``` ### alpaka @@ -228,11 +224,7 @@ module load clang-16.0.6-omp-nvptx module load cudatoolkit source /global/homes/a/atif/packages/root_install/bin/thisroot.sh export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs -<<<<<<< HEAD -cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=off -DENABLE_OMPGPU=on -DCMAKE_CXX_COMPILER=clang++ -DINPUT_PATH="../../FastCaloSimInputs" -DCMAKE_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/ -======= cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=off -DENABLE_OMPGPU=on -DCMAKE_CXX_COMPILER=clang++ -DINPUT_PATH="../../FastCaloSimInputs" -DCMAKE_LIBRARY_PATH=/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/ -DCMAKE_CXX_FLAGS="--offload-arch=sm_80" ->>>>>>> group_sim_combined ## Build Instructions for Cori From 3f97c17e19573992fa96435615b0aba9c73c67c0 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Wed, 16 Oct 2024 19:14:22 -0700 Subject: [PATCH 22/25] added rocrand for openmp on amd devices --- FastCaloSimAnalyzer/CMakeLists.txt | 1 - .../FastCaloGpu/src/CMakeLists.txt | 9 ++++-- .../FastCaloGpu/src/Rand4Hits_omp.cxx | 29 +++++++++++++++++++ scripts/script_build_gr_all.sh | 15 ++++++++-- 4 files changed, 48 insertions(+), 6 deletions(-) diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index 9e96252..b7c9427 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -23,7 +23,6 @@ set(USE_ALPAKA OFF CACHE BOOL "Use alpaka") set(USE_HIP OFF CACHE BOOL "Use HIP") set(HIP_TARGET "AMD" CACHE STRING "HIP backend. must be either AMD or NVIDIA") - if ( USE_STDPAR ) if ( ${STDPAR_TARGET} STREQUAL "cpu" ) if ( NOT RNDGEN_CPU ) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index fe3eec5..28f069a 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -140,7 +140,13 @@ elseif(ENABLE_OMPGPU) target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() if(OMP_OFFLOAD_TARGET_AMD GREATER 0) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -D__HIP_PLATFORM_AMD__) target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DOMP_OFFLOAD_TARGET_AMD) + find_package(HIP REQUIRED) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/include ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/librocrand.so) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libamdhip64.so) endif() else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) @@ -183,9 +189,6 @@ else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) endif() elseif(ENABLE_OMPGPU) - if(OMP_OFFLOAD_TARGET_AMD GREATER 0) - message(FATAL_ERROR "when OMP TARGET OFFLOAD to AMD, RNDGEN_CPU must be ON") - endif() if(ENV{OMP_TARGET_OFFLOAD} MATCHES "disabled") message(FATAL_ERROR "when OMP_TARGET_OFFLOAD disabled, RNDGEN_CPU must be ON") endif() diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index a1a4b83..5e9ffd2 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -9,16 +9,27 @@ #include "gpuQ.h" #include #include +#elif defined OMP_OFFLOAD_TARGET_AMD +#include "hip/hip_runtime.h" +#include #endif #include "GpuParams.h" #include "Rand4Hits_cpu.cxx" +#ifdef OMP_OFFLOAD_TARGET_NVIDIA #define CURAND_CALL( x ) \ if ( ( x ) != CURAND_STATUS_SUCCESS ) { \ printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \ exit( EXIT_FAILURE ); \ } +#elif defined OMP_OFFLOAD_TARGET_AMD +#define ROCRAND_CALL( x ) \ + if ((x) != ROCRAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } +#endif void Rand4Hits::allocate_simulation( int maxbins, int maxhitct, unsigned long n_cells ) { @@ -73,9 +84,14 @@ Rand4Hits::~Rand4Hits() { if ( m_useCPU ) { destroyCPUGen(); } else { +#ifndef RNDGEN_CPU #ifdef OMP_OFFLOAD_TARGET_NVIDIA CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) ); delete (curandGenerator_t*)m_gen; +#elif defined OMP_OFFLOAD_TARGET_AMD + ROCRAND_CALL(rocrand_destroy_generator( *( (rocrand_generator*)m_gen))); + delete (rocrand_generator *)m_gen; +#endif #endif } }; @@ -88,8 +104,12 @@ void Rand4Hits::rd_regen() { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { +#ifndef RNDGEN_CPU #ifdef OMP_OFFLOAD_TARGET_NVIDIA CURAND_CALL( curandGenerateUniform( *( (curandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); +#elif defined OMP_OFFLOAD_TARGET_AMD + ROCRAND_CALL(rocrand_generate_uniform( *( (rocrand_generator*)m_gen), m_rand_ptr, 3 * m_total_a_hits)); +#endif #endif } }; @@ -111,6 +131,7 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { std::cout << "ERROR: copy random numbers from cpu to gpu " << std::endl; } } else { +#ifndef RNDGEN_CPU #ifdef OMP_OFFLOAD_TARGET_NVIDIA gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); curandGenerator_t* gen = new curandGenerator_t; @@ -118,6 +139,14 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { CURAND_CALL( curandSetPseudoRandomGeneratorSeed( *gen, seed ) ); CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); m_gen = (void*)gen; +#elif defined OMP_OFFLOAD_TARGET_AMD + hipMalloc(&f, num * sizeof(float)); + rocrand_generator* gen = new rocrand_generator; + ROCRAND_CALL(rocrand_create_generator(gen, ROCRAND_RNG_PSEUDO_DEFAULT)); + ROCRAND_CALL(rocrand_set_seed(*gen, seed)); + ROCRAND_CALL(rocrand_generate_uniform(*gen, f, num)); + m_gen = (void*)gen; +#endif #endif } diff --git a/scripts/script_build_gr_all.sh b/scripts/script_build_gr_all.sh index 00bd293..e2432f8 100644 --- a/scripts/script_build_gr_all.sh +++ b/scripts/script_build_gr_all.sh @@ -11,7 +11,7 @@ ### CPURNG Exalearn5 ### OMPRNG ## AMD ----------------- -### HIPRAND xxxxxxxxx +### ROCRAND Exalearn4 ### CPURNG Exalearn4 ### OMPRNG ## Multicore CPU -------- @@ -136,7 +136,18 @@ fi # OpenMP ## AMD -### HIPRAND +### ROCRAND +if [ "$system" = "exalearn4" ]; then + echo "x-x-x-x-x OpenMP AMD CPURNG BUILD x-x-x-x-x" + export OMP_TARGET_OFFLOAD=mandatory + export ROCM_PATH=/opt/rocm/ + mkdir -p build-exalearn4-openmp-amd-rocrand + cd build-exalearn4-openmp-amd-rocrand + cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_CPU=Off -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_FLAGS="--offload-arch=gfx908" + make -j32 + echo "x-x-x-x-x OpenMP AMD CPURNG BUILD DONE x-x-x-x-x" + cd .. +fi # Port does not exist ### CPURNG if [ "$system" = "exalearn4" ]; then From 426850d6cf28f93dc0bdc448f0bbd1b1b45320c1 Mon Sep 17 00:00:00 2001 From: atif4461 Date: Mon, 21 Oct 2024 09:53:56 -0700 Subject: [PATCH 23/25] merged portable_rng, tested exalearn 4,5 --- readme_omp_rng.md | 17 ----------------- 1 file changed, 17 deletions(-) delete mode 100644 readme_omp_rng.md diff --git a/readme_omp_rng.md b/readme_omp_rng.md deleted file mode 100644 index 1816ec7..0000000 --- a/readme_omp_rng.md +++ /dev/null @@ -1,17 +0,0 @@ - -## Build Instructions for alpha/lambda @ CSI, BNL -Change OMP_RNG path in FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt -according to your location of -git clone https://github.com/GKNB/test-benchmark-OpenMP-atomic.git - -``` -module use /work/software/modulefiles -module load llvm-openmp-dev -source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh -export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ -export OMP_TARGET_OFFLOAD=mandatory -cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_CUDA -I/usr/local/cuda/include" -``` - -# For AMD -cmake ../FastCaloSimAnalyzer -DENABLE_XROOTD=off -DENABLE_GPU=on -DENABLE_OMPGPU=on -DRNDGEN_OMP=on -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CXX_STANDARD=14 -DCUDA_CUDART_LIBRARY=/usr/local/cuda/lib64/libcudart.so -DCUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda/ -DCMAKE_CUDA_COMPILER=/usr/local/cuda/bin/nvcc -DCMAKE_CXX_FLAGS="-DARCH_HIP -I/opt/rocm/include -L/opt/rocm/rocrand/lib/ -lrocrand" From 6653e2165de18e7c92c9e2e9ef343a8445e4d75f Mon Sep 17 00:00:00 2001 From: atif4461 Date: Mon, 21 Oct 2024 11:53:15 -0700 Subject: [PATCH 24/25] fixed omp rng bugs --- FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt | 2 +- .../FastCaloGpu/src/Rand4Hits_omp.cxx | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 08fd346..37a6bbe 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -181,6 +181,7 @@ if(RNDGEN_CPU) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU ) elseif(RNDGEN_OMP) message(STATUS "Will generate random numbers using Portable OpenMP RNG Library") + target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_OMP ) if(NOT DEFINED OMPRNG_HOME) include(FetchContent) set(FETCHCONTENT_QUIET OFF) @@ -194,7 +195,6 @@ elseif(RNDGEN_OMP) else() include_directories(${OMPRNG_HOME}) include_directories(${OMPRNG_HOME}/implementation) - target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_OMP ) endif() if(ARCH_CUDA) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DARCH_CUDA ) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx index 449cf14..89db2ad 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_omp.cxx @@ -80,11 +80,13 @@ Rand4Hits::~Rand4Hits() { << " lost: " << DEV_BigMem::bm_ptr->lost() << std::endl; delete DEV_BigMem::bm_ptr; } +#ifdef RNDGEN_OMP omp_target_free( m_rand_ptr, m_select_device ); +#endif if ( m_useCPU ) { destroyCPUGen(); } else { -#ifndef RNDGEN_CPU +#ifndef RNDGEN_OMP #ifndef USE_RANDOM123 #ifdef OMP_OFFLOAD_TARGET_NVIDIA CURAND_CALL( curandDestroyGenerator( *( (curandGenerator_t*)m_gen ) ) ); @@ -107,8 +109,8 @@ void Rand4Hits::rd_regen() { } } else { #ifdef RNDGEN_OMP -# ifdef USE_RANDOM123 auto gen = generator_enum::xorwow; +# ifdef USE_RANDOM123 float* f_r123 = (float*) malloc ( 3 * m_total_a_hits * sizeof( float ) ); omp_get_rng_uniform_float(f_r123, 3 * m_total_a_hits, m_seed, gen); if ( omp_target_memcpy( m_rand_ptr, f_r123, 3 * m_total_a_hits * sizeof( float ), m_offset, m_offset, m_select_device, @@ -149,10 +151,10 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { } } else { #ifdef RNDGEN_OMP - #ifdef USE_RANDOM123 f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device ); - float* f_r123 = (float*) malloc ( num * sizeof( float ) ); auto gen = generator_enum::xorwow; + #ifdef USE_RANDOM123 + float* f_r123 = (float*) malloc ( num * sizeof( float ) ); omp_get_rng_uniform_float(f_r123, num, seed, gen); if ( omp_target_memcpy( f, f_r123, num * sizeof( float ), m_offset, m_offset, m_select_device, m_initial_device ) ) { @@ -160,8 +162,6 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { } free(f_r123); #else - f = (float*)omp_target_alloc( num * sizeof( float ), m_select_device ); - auto gen = generator_enum::xorwow; omp_get_rng_uniform_float(f, num, seed, gen); #endif m_gen = (void*)gen; From cf986a7f4948de681ee0d4b929731937078d8e1f Mon Sep 17 00:00:00 2001 From: atif4461 Date: Tue, 10 Dec 2024 18:50:24 -0800 Subject: [PATCH 25/25] added dockerfile llvm amdgpu --- scripts/fcs-llvm-amdgpu.Dockerfile | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 scripts/fcs-llvm-amdgpu.Dockerfile diff --git a/scripts/fcs-llvm-amdgpu.Dockerfile b/scripts/fcs-llvm-amdgpu.Dockerfile new file mode 100644 index 0000000..e14b9b0 --- /dev/null +++ b/scripts/fcs-llvm-amdgpu.Dockerfile @@ -0,0 +1,26 @@ +FROM dingpf/fcs-rocm + +USER root + +RUN \ + cd /hep-mini-apps && \ + mkdir -p llvm-amdgpu && \ + git clone --depth 1 --branch llvmorg-19.1.0 https://github.com/llvm/llvm-project.git && \ + cd llvm-project && \ + mkdir -p build && \ + cd build && \ + cmake -G "Unix Makefiles" \ + -B /hep-mini-apps/llvm-project/build/ \ + -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;lldb;compiler-rt" \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ \ + -DLLVM_ENABLE_RUNTIMES:STRING="openmp;offload" \ + -DCLANG_DEFAULT_OPENMP_RUNTIME:STRING=libomp \ + -DCMAKE_INSTALL_PREFIX=/hep-mini-apps/llvm-amdgpu \ + -DLLVM_TARGETS_TO_BUILD:STRING="X86;AMDGPU" \ + -DLIBOMPTARGET_DEVICE_ARCHITECTURES="gfx906;gfx908;gfx90a" \ + /hep-mini-apps/llvm-project/llvm && \ + make -j128 && \ + make install && \ + cd ../../ && \ + rm -rf llvm-project