diff --git a/FastCaloSimAnalyzer/CMakeLists.txt b/FastCaloSimAnalyzer/CMakeLists.txt index de33cc6..ae1bdeb 100644 --- a/FastCaloSimAnalyzer/CMakeLists.txt +++ b/FastCaloSimAnalyzer/CMakeLists.txt @@ -27,7 +27,9 @@ if ( USE_STDPAR ) SET(CUDA_USE_STATIC_CUDA_RUNTIME OFF) if ( ${STDPAR_TARGET} STREQUAL "cpu" ) if ( NOT RNDGEN_CPU ) - message(FATAL_ERROR "when STDPAR_TARGET=cpu, RNDGEN_CPU must be ON") + message(WARNING "when STDPAR_TARGET=cpu, RNDGEN_CPU must be ON") + message(WARNING "Setting RNDGEN_CPU to ON") + set( RNDGEN_CPU ON ) endif() set(STDPAR_DIRECTIVE "-nostdpar") elseif( ${STDPAR_TARGET} STREQUAL "gpu" ) @@ -38,6 +40,11 @@ if ( USE_STDPAR ) message(WARNING "Setting USE_ATOMIC_ADD to OFF") set ( USE_ATOMIC_ADD OFF ) endif() + if ( NOT RNDGEN_CPU ) + message(WARNING "when STDPAR_TARGET=multicore, RNDGEN_CPU must be ON") + message(WARNING "Setting RNDGEN_CPU to ON") + set( RNDGEN_CPU ON ) + endif() set(STDPAR_DIRECTIVE "-stdpar=multicore") else() message(FATAL_ERROR "unknown stdpar target ${STDPAR_TARGET}") @@ -51,7 +58,15 @@ elseif(USE_ALPAKA) find_package(alpaka REQUIRED) elseif(USE_HIP) - find_package(HIP REQUIRED) + if ( ${HIP_TARGET} STREQUAL "NVIDIA" ) + find_package(HIP) + if ( NOT RNDGEN_CPU ) + message(FATAL_ERROR "when HIP_TARGET=NVIDIA, RNDGEN_CPU must be ON") + endif() + else() + find_package(HIP REQUIRED) + endif() + endif() include(FastCaloSim) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/CountingIterator.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/CountingIterator.h index 02f9fd5..77e8066 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/CountingIterator.h +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/CountingIterator.h @@ -1,3 +1,10 @@ +#ifndef COUNTING_ITERATOR_H +#define COUNTING_ITERATOR_H 1 + +#include +#include +#include + struct counting_iterator { typedef size_t Index_t; @@ -67,3 +74,6 @@ struct counting_iterator { private: value_type value; }; + + +#endif diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h index a7afba8..69614d0 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/HostDevDef.h @@ -18,6 +18,9 @@ # define __HOSTDEV__ # define __INLINE__ inline #elif defined(USE_HIP) +# if defined(HIP_TARGET_NVIDIA) +# include "cuda_runtime.h" +# endif # define __DEVICE__ __device__ # define __HOST__ __host__ # define __HOSTDEV__ __host__ __device__ diff --git a/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/TestStdPar.h b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/TestStdPar.h new file mode 100644 index 0000000..1b15cbd --- /dev/null +++ b/FastCaloSimAnalyzer/FastCaloGpu/FastCaloGpu/TestStdPar.h @@ -0,0 +1,18 @@ +#ifndef FCS_TEST_STDPAR +#define FCS_TEST_STDPAR 1 + +class TestStdPar { + +public: + + void testAll(unsigned long); + + void test_floatArray(unsigned long); + void test_vecInt(unsigned long); + void test_vecFloat(unsigned long); + void test_atomicAdd_int(unsigned long); + void test_atomicAdd_float(unsigned long); + +}; + +#endif diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt index 3dfda73..5ec1310 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CMakeLists.txt @@ -87,8 +87,15 @@ endif() # Sources if(USE_STDPAR) - set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_sp.cxx gpuQ.cu Rand4Hits.cu Rand4Hits_sp.cxx ) -# set(FastCaloGpu_Srcs gpuQ.cu CaloGpuGeneral.cxx KernelWrapper.cu Rand4Hits.cu ) + set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_sp.cxx Rand4Hits_sp.cxx TestStdPar.cxx) + if ( ${STDPAR_TARGET} STREQUAL "gpu" ) + set(FastCaloGpu_Srcs ${FastCaloGpu_Srcs} gpuQ.cu ) + endif() + if ( RNDGEN_CPU ) + set(FastCaloGpu_Srcs ${FastCaloGpu_Srcs} Rand4Hits_cpu.cxx ) + else() + set(FastCaloGpu_Srcs ${FastCaloGpu_Srcs} Rand4Hits.cu ) + endif() elseif(USE_KOKKOS) set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_kk.cxx ) elseif(USE_ALPAKA) @@ -96,18 +103,27 @@ elseif(USE_ALPAKA) elseif(ENABLE_OMPGPU) set(FastCaloGpu_Srcs KernelWrapper_omp.cxx) elseif(USE_HIP) - include_directories( /opt/rocm/hip/include ) + # Define ROCM_PATH if not defined + if (NOT DEFINED ENV{ROCM_PATH}) + message(WARNING "Environment variable ROCM_PATH not set! Using default /opt/rocm/") + set(ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory.") + else() + set(ROCM_PATH $ENV{ROCM_PATH}) + endif() + include_directories( ${ROCM_PATH}/hip/include ) + if ( ${HIP_TARGET} STREQUAL "AMD" ) message(STATUS " Using AMD HIP backend") - add_compile_definitions(__HIP_PLATFORM_AMD__ HIP_PLATFORM_AMD) + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) message(STATUS " Using NVIDIA HIP backend") - add_compile_definitions(__HIP_PLATFORM_NVIDIA__ HIP_PLATFORM_NVIDIA) + find_package(CUDAToolkit REQUIRED) + set(CMAKE_CUDA_ARCHITECTURES "70;75;80;86") + set(CMAKE_HIP_ARCHITECTURES "gfx90a;gfx906;gfx908") else() message(FATAL_ERROR "unknown HIP_TARGET=${HIP_TARGET}. Must be either AMD or NVIDIA") endif() - add_compile_definitions(__HIP_PLATFORM_HCC__ HIP_PLATFORM_HCC) - + set(FastCaloGpu_Srcs GeoLoadGpu.cxx KernelWrapper_hip.cxx gpuQ_hip.cxx Rand4Hits_hip.cxx ) else() @@ -144,10 +160,13 @@ if(USE_ALPAKA) elseif(USE_HIP) target_compile_definitions(${FastCaloGpu_LIB} PRIVATE ${FCS_CommonDefinitions}) target_include_directories(${FastCaloGpu_LIB} PRIVATE ../FastCaloGpu/ } ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/hiprand/include ) - target_include_directories(${FastCaloGpu_LIB} PRIVATE /opt/rocm/rocrand/include ) - - target_link_libraries(${FastCaloGpu_LIB} PUBLIC /opt/rocm/lib/libhiprand.so) + + if ( ${HIP_TARGET} STREQUAL "AMD" ) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_AMD) + elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) + target_compile_definitions(${FastCaloGpu_LIB} PUBLIC -DHIP_TARGET_NVIDIA) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC CUDA::cudart) + endif() else() target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${NVTOOLSEXT_LIB} ${CURAND_LIB}) endif() @@ -184,13 +203,24 @@ if(USE_STDPAR) endif() - target_link_options(${FastCaloGpu_LIB} PRIVATE ${STDPAR_DIRECTIVE}) + target_link_options(${FastCaloGpu_LIB} PRIVATE ${STDPAR_DIRECTIVE} -Xlinker -z noexecstack) endif() if(RNDGEN_CPU) message(STATUS "Will generate random numbers on CPU") target_compile_definitions(${FastCaloGpu_LIB} PRIVATE -DRNDGEN_CPU ) + # TODO Link a portable RNG library +else() + if(USE_HIP) + if ( ${HIP_TARGET} STREQUAL "AMD" ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/hiprand/include ) + target_include_directories(${FastCaloGpu_LIB} PRIVATE ${ROCM_PATH}/rocrand/include ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${ROCM_PATH}/lib/libhiprand.so) + elseif( ${HIP_TARGET} STREQUAL "NVIDIA" ) + target_link_libraries(${FastCaloGpu_LIB} PUBLIC ${CUDA_curand_LIBRARY} ${CUDA_nvToolsExt_LIBRARY}) + endif() + endif() endif() if(DUMP_HITCELLS) diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx index 86f6ad7..8cbacc1 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral.cxx @@ -69,9 +69,9 @@ void* CaloGpuGeneral::Rand4Hits_init( long long maxhits, unsigned short maxbin, std::cout << "using alpaka\n"; #elif defined(USE_HIP) std::cout << "using HIP on "; - #ifdef __HIP_PLATFORM_NVIDIA__ + #ifdef HIP_TARGET_NVIDIA std::cout << "NVIDIA\n"; - #elif defined __HIP_PLATFORM_AMD__ + #elif defined HIP_TARGET_AMD std::cout << "AMD\n"; #else std::cout << "UNKNOWN\n"; diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_sp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_sp.cxx index e12cb7b..b5ddc64 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_sp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/CaloGpuGeneral_sp.cxx @@ -9,7 +9,11 @@ #include "Rand4Hits.h" #include "Hit.h" #include "CountingIterator.h" -#include "nvToolsExt.h" + +// FIXME: Bug in nvhpc 24.X +#if defined ( _NVHPC_STDPAR_NONE ) + #include "nvToolsExt.h" +#endif #define DO_ATOMIC_TESTS 0 diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits.cu b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits.cu index 2adee05..5fed636 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits.cu +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits.cu @@ -52,13 +52,13 @@ void Rand4Hits::allocateGenMem(size_t num) { Rand4Hits::~Rand4Hits() { - + #ifdef USE_STDPAR deallocate(); #else delete ( m_rnd_cpu ); #endif - + #ifdef USE_STDPAR if (!m_useCPU) { gpuQ( cudaFree( m_rand_ptr ) ); @@ -66,7 +66,7 @@ Rand4Hits::~Rand4Hits() { #else gpuQ( cudaFree( m_rand_ptr ) ); #endif - + if ( m_useCPU ) { destroyCPUGen(); } else { @@ -91,7 +91,7 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { float* f{nullptr}; m_useCPU = useCPU; - + if ( m_useCPU ) { allocateGenMem( num ); createCPUGen( seed ); @@ -110,9 +110,9 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { CURAND_CALL( curandGenerateUniform( *gen, f, num ) ); m_gen = (void*)gen; } - + m_rand_ptr = f; - + std::cout << "R4H m_rand_ptr: " << m_rand_ptr << std::endl; } diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_cpu.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_cpu.cxx index fcd88f7..ec4cbae 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_cpu.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_cpu.cxx @@ -1,7 +1,7 @@ /* Copyright (C) 2002-2021 CERN for the benefit of the ATLAS collaboration */ - +#include "Rand4Hits.h" #include #include #include diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx index a302e37..7a08181 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_hip.cxx @@ -5,15 +5,30 @@ #include "Rand4Hits.h" #include "gpuQ.h" #include + +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) +#include +#else #include +#endif +#endif #include "Rand4Hits_cpu.cxx" -#define CURAND_CALL( x ) \ - if ( ( x ) != HIPRAND_STATUS_SUCCESS ) { \ - printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \ - exit( EXIT_FAILURE ); \ - } +#if defined (HIP_TARGET_NVIDIA) + #define CURAND_CALL(x) \ + if ((x) != CURAND_STATUS_SUCCESS) { \ + printf("Error at %s:%d\n", __FILE__, __LINE__); \ + exit(EXIT_FAILURE); \ + } +#else + #define CURAND_CALL( x ) \ + if ( ( x ) != HIPRAND_STATUS_SUCCESS ) { \ + printf( "Error at %s:%d\n", __FILE__, __LINE__ ); \ + exit( EXIT_FAILURE ); \ + } +#endif #ifndef USE_STDPAR void Rand4Hits::allocate_simulation( long long /*maxhits*/, unsigned short /*maxbins*/, unsigned short maxhitct, @@ -70,8 +85,15 @@ Rand4Hits::~Rand4Hits() { if ( m_useCPU ) { destroyCPUGen(); } else { +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) + CURAND_CALL(curandDestroyGenerator(*((curandGenerator_t *)m_gen))); + delete (curandGenerator_t *)m_gen; +#else CURAND_CALL( hiprandDestroyGenerator( *( (hiprandGenerator_t*)m_gen ) ) ); delete (hiprandGenerator_t*)m_gen; +#endif +#endif } }; @@ -82,7 +104,14 @@ void Rand4Hits::rd_regen() { gpuQ( hipMemcpy( m_rand_ptr, m_rnd_cpu->data(), 3 * m_total_a_hits * sizeof( float ), hipMemcpyHostToDevice ) ); #endif } else { +#ifndef RNDGEN_CPU +#if defined (HIP_TARGET_NVIDIA) + CURAND_CALL(curandGenerateUniform(*((curandGenerator_t *)m_gen), m_rand_ptr, + 3 * m_total_a_hits)); +#else CURAND_CALL( hiprandGenerateUniform( *( (hiprandGenerator_t*)m_gen ), m_rand_ptr, 3 * m_total_a_hits ) ); +#endif +#endif } }; @@ -103,12 +132,21 @@ void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { gpuQ( hipMemcpy( f, m_rnd_cpu->data(), num * sizeof( float ), hipMemcpyHostToDevice ) ); #endif } else { +#ifndef RNDGEN_CPU gpuQ( hipMalloc( &f, num * sizeof( float ) ) ); +#if defined (HIP_TARGET_NVIDIA) + curandGenerator_t *gen = new curandGenerator_t; + CURAND_CALL(curandCreateGenerator(gen, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CALL(curandSetPseudoRandomGeneratorSeed(*gen, seed)); + CURAND_CALL(curandGenerateUniform(*gen, f, num)); +#else hiprandGenerator_t* gen = new hiprandGenerator_t; CURAND_CALL( hiprandCreateGenerator( gen, HIPRAND_RNG_PSEUDO_DEFAULT ) ); CURAND_CALL( hiprandSetPseudoRandomGeneratorSeed( *gen, seed ) ); CURAND_CALL( hiprandGenerateUniform( *gen, f, num ) ); +#endif m_gen = (void*)gen; +#endif } m_rand_ptr = f; diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_sp.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_sp.cxx index 7af949d..186251b 100644 --- a/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_sp.cxx +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/Rand4Hits_sp.cxx @@ -2,6 +2,10 @@ #include #include +#ifndef RNDGEN_CPU +#include "gpuQ.h" +#endif + void Rand4Hits::allocate_simulation( long long /*maxhits*/, unsigned short /*maxbins*/, unsigned short maxhitct, unsigned long n_cells ) { @@ -35,3 +39,47 @@ void Rand4Hits::deallocate() { delete ( m_rnd_cpu ); } +/* +**** these are also defined in Rand4Hits.cu +*/ +//#ifndef _NVHPC_STDPAR_GPU +#ifdef RNDGEN_CPU + +Rand4Hits::~Rand4Hits() { + deallocate(); + + destroyCPUGen(); +} + +void Rand4Hits::rd_regen() { + genCPU( 3 * m_total_a_hits ); +}; + +void Rand4Hits::create_gen( unsigned long long seed, size_t num, bool useCPU ) { + + float* f{nullptr}; + + m_useCPU = useCPU; + + if ( m_useCPU ) { + allocateGenMem( num ); + createCPUGen( seed ); + genCPU( num ); +#ifdef USE_STDPAR + f = m_rnd_cpu->data(); +#else + gpuQ( cudaMalloc( &f, num * sizeof( float ) ) ); + gpuQ( cudaMemcpy( f, m_rnd_cpu->data(), num * sizeof( float ), cudaMemcpyHostToDevice ) ); +#endif + } else { + std::cout << "ERROR: should only be using CPU for Random Number Generator\n"; + throw std::runtime_error( "Rand4Hits::create_gen CPU ERROR: should only be using CPU for Random Number Generator\n" ); + } + + m_rand_ptr = f; + + std::cout << "R4H m_rand_ptr: " << m_rand_ptr << std::endl; + +} + +#endif diff --git a/FastCaloSimAnalyzer/FastCaloGpu/src/TestStdPar.cxx b/FastCaloSimAnalyzer/FastCaloGpu/src/TestStdPar.cxx new file mode 100644 index 0000000..fa0876f --- /dev/null +++ b/FastCaloSimAnalyzer/FastCaloGpu/src/TestStdPar.cxx @@ -0,0 +1,180 @@ +#include "TestStdPar.h" +#include "CountingIterator.h" + +#include +#include +#include +#include +#include + +void TestStdPar::testAll(unsigned long num) { + test_vecInt(num); + test_vecFloat(num); + test_atomicAdd_int(num); + test_atomicAdd_float(num); +} + +void TestStdPar::test_floatArray(unsigned long num) { + + std::cout << "---------- test_floatArray( " << num << " ) -------------\n"; + + double sum(0.); + float *pf = new float[num]; + for (int i=0; i( " << num << " ) -------------\n"; + + double sum(0.); + std::vector* pvec = new std::vector; + pvec->resize(num); + + float* pdat = pvec->data(); + + for (int i=0; i( " << num << " ) -------------\n"; + + long int sum(0); + std::vector* pvec = new std::vector; + pvec->resize(num); + + int* pdat = pvec->data(); + + for (int i=0; i(num) -------------\n"; + + } + + + +void TestStdPar::test_atomicAdd_int(unsigned long num) { + std::cout << "---------- test_atomic_add -------------\n"; + std::atomic *ii = new std::atomic{0}; + std::for_each_n(std::execution::par_unseq, counting_iterator(0), num, + [=](int i) { + int j = (*ii)++; + printf("%d %d\n",i,j); + } ); + std::cout << " after loop: " << *ii << " (should be " << num << ")" <_add -------------\n\n"; + } + + /* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * */ + +void TestStdPar::test_atomicAdd_float(unsigned long N) { + std::cout << "---------- test_atomicAdd_float -------------\n"; + + float ta[N]{0.}, tc[N]{0.}; + for (int i=0; i + +void TFCSStdParTest::test(bool doAtomic, bool doVector, unsigned long num) { + + TestStdPar tst; + + if (doAtomic) { + tst.test_atomicAdd_int(num); + tst.test_atomicAdd_float(num); + } + + if (doVector) { + tst.test_floatArray(num); + tst.test_vecFloat(num); + tst.test_vecInt(num); + } + +} diff --git a/FastCaloSimAnalyzer/macro/CMakeLists.txt b/FastCaloSimAnalyzer/macro/CMakeLists.txt index dac782f..b6d41a4 100644 --- a/FastCaloSimAnalyzer/macro/CMakeLists.txt +++ b/FastCaloSimAnalyzer/macro/CMakeLists.txt @@ -47,3 +47,13 @@ fcs_make_task(runTFCSSimulation DEPENDENCY ${AthenaStandalone_LIB} DEPENDENCY ${FastCaloSimAnalyzer_LIB} ) + +if(USE_STDPAR) + message(STATUS "Building StdPar tests") + fcs_make_task(runTFCSStdParTest + SOURCE runTFCSStdParTest.cxx + DEPENDENCY ${FastCaloSimCommon_LIB} + DEPENDENCY ${AthenaStandalone_LIB} + DEPENDENCY ${FastCaloSimAnalyzer_LIB} + ) +endif() diff --git a/FastCaloSimAnalyzer/macro/runTFCSStdParTest.cxx b/FastCaloSimAnalyzer/macro/runTFCSStdParTest.cxx new file mode 100644 index 0000000..f397aa5 --- /dev/null +++ b/FastCaloSimAnalyzer/macro/runTFCSStdParTest.cxx @@ -0,0 +1,37 @@ +#include +#include +#include +#include "citer.h" +#include + +#include "FastCaloSimAnalyzer/TFCSStdParTest.h" + +static const char* USAGE = + R"(Run test for stdpar + +Usage: + runTFCSStdParTest [--doAtomicTest] [--doVectorTest] [-n | --num ] + runTFCSStdParTest (-h | --help) + +Options: + -h --help Show help screen. + --doAtomicTest Do test for atomic increments [default: false]. + --doVectorTest Do test for allocating/accessing vector [default: false]. + -n --num Size of array to allocate for tests [default: 10]. +)"; + + +int main( int argc, char** argv ) { + + std::map args = docopt::docopt( USAGE, {argv + 1, argv + argc}, true ); + + bool doAtomic = args["--doAtomicTest"].asBool(); + bool doVec = args["--doVectorTest"].asBool(); + int num = args["--num"].asLong(); + + TFCSStdParTest test; + + test.test(doAtomic,doVec,num); + + return 0; +} diff --git a/README.md b/README.md index 50a4e1c..c20c44b 100644 --- a/README.md +++ b/README.md @@ -185,6 +185,34 @@ cmake ../src/FastCaloSimAnalyzer \ -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off ``` +### BNL CSI lambda2: HIP for AMD + +``` +module use /work/software/modulefiles +module load rocmmod4.5.0 +source /work/atif/packages/root-6.24-gcc-9.3.0/bin/thisroot.sh +export FCS_DATAPATH=/work/atif/FastCaloSimInputs/ +/work/atif/packages/cmake-3.25.0-linux-x86_64/bin/cmake ../FastCaloSimAnalyzer \ + -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=hipcc \ + -DCMAKE_CXX_STANDARD=14 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on \ + -DUSE_HIP=on -DHIP_TARGET=AMD -DCMAKE_CXX_FLAGS="-I/opt/rocm/hip/include/hip/" +``` + +### Perlmutter: HIP for Nvidia + +For Nvidia backend with HIP select HIP_PLATFORM=nvidia, HIP_COMPILER=nvcc, HIP_RUNTIME=cuda and use hipcc_nvidia compiler script + +``` +export HIP_PLATFORM=nvidia +export HIP_COMPILER=nvcc +export HIP_RUNTIME=cuda +module load hip +export ROCM_PATH=/global/common/software/nersc/pe/rocm/5.5.1 +export FCS_DATAPATH=/pscratch/sd/a/atif/FastCaloSimInputs +source /global/homes/a/atif/packages/root_install/bin/thisroot.sh +cmake ../FastCaloSimAnalyzer/ -DENABLE_XROOTD=Off -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=/global/homes/a/atif/FCS-GPU/scripts/hipcc_nvidia -DCMAKE_CXX_STANDARD=17 -DCMAKE_CXX_EXTENSIONS=Off -DENABLE_GPU=on -DUSE_HIP=on -DHIP_TARGET=NVIDIA -DCMAKE_LIBRARY_PATH="/opt/nvidia/hpc_sdk/Linux_x86_64/22.7/math_libs/11.7/lib64/;/global/common/software/nersc/pe/rocm/5.5.1/hip/include/hip/" -DRNDGEN_CPU=on +``` + ### alpaka The alpaka version of FastCaloSim has been tested with two backends: CUDA and HIP. For the former backend alpaka should be configured with `-Dalpaka_ACC_GPU_CUDA_ENABLE=ON`, while for the latter one should use `-Dalpaka_ACC_GPU_HIP_ENABLE=ON`. For more information about `CMake` arguments used by alpaka see [this documentation](https://alpaka.readthedocs.io/en/latest/advanced/cmake.html). diff --git a/scripts/hipcc_nvidia b/scripts/hipcc_nvidia new file mode 100755 index 0000000..2c79b1d --- /dev/null +++ b/scripts/hipcc_nvidia @@ -0,0 +1,36 @@ +#!/bin/bash + +HIPCC=$( which hipcc ) +CXX=$( which g++ ) +A=$* + +echo $A | grep -q USE_HIP +HIP=$? + +echo $A | grep -q FastCaloGpu +FC=$? + +if [[ $HIP -eq 0 && $FC -eq 0 ]]; then + X=$( echo $* | sed s/-pipe// ) + X=$( echo $X | sed s/-fsigned-char// ) + X=$( echo $X | sed s/-pthread// ) + X=$( echo $X | sed s/-Wall// ) + X=$( echo $X | sed s/-Wno-long-long// ) + X=$( echo $X | sed s/-Wno-deprecated// ) + X=$( echo $X | sed s/-Wno-unused-local-typedefs// ) + X=$( echo $X | sed s/-Wwrite-strings// ) + X=$( echo $X | sed s/-Wpointer-arith// ) + X=$( echo $X | sed s/-Woverloaded-virtual// ) + X=$( echo $X | sed s/-Wextra// ) + X=$( echo $X | sed s/-Werror=return-type// ) + X=$( echo $X | sed s/-fPIC/"-shared -Xcompiler -fPIC"/ ) + + CXX=${HIPCC} + CXXARGS="$X" +else + A=$( echo $A ) + + CXXARGS=$A +fi + +$CXX $CXXARGS diff --git a/scripts/nvc++_p b/scripts/nvc++_p index 6e20326..cfed91b 100755 --- a/scripts/nvc++_p +++ b/scripts/nvc++_p @@ -8,38 +8,17 @@ A=$* ## These can be generated with ## > makelocalrc -gcc PATH_TO_GCC -gpp PATH_TO_G++ -x -d PATH_TO_LOCALRC_DIR # -LOCALRC="" -GCCVER=$( gcc --version | head -1 | awk '{print $NF}' ) -case $GCCVER in - 12.3.0) - LOCALRC="${NVHPC_ROOT}/localrc_gcc123" - ;; - 11.4.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc114" - ;; - 11.3.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc113" - ;; - 11.2.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc112" - ;; - 10.2.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc102" - ;; - 10.1.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc101" - ;; - 9.3.0) - LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc93" - ;; -esac - -if [[ ! -f $LOCALRC ]]; then - echo "nvc++_p ERROR: no local rc file \"$LOCALRC\" found" +if [[ -z ${NVHPC_LOCALRC+x} ]]; then + GCCVER=$( gcc --version | head -1 | awk '{print $NF}' | sed s/'\.'//g | sed s/.$// ) + NVHPC_LOCALRC="${NVHPC_ROOT}/compilers/bin/localrc_gcc${GCCVER}" +fi + +if [[ ! -f $NVHPC_LOCALRC ]]; then + echo "nvc++_p ERROR: no local rc file \"$NVHPC_LOCALRC\" found" exit 1 fi -LOCALRC="-rc=${LOCALRC}" +LOCALRC="-rc=${NVHPC_LOCALRC}" STDPAROPTS="-cudalib=curand"