diff --git a/.github/workflows/doc-generation.yml b/.github/workflows/doc-generation.yml index 646101ccdbd..1991bd5ef58 100644 --- a/.github/workflows/doc-generation.yml +++ b/.github/workflows/doc-generation.yml @@ -17,7 +17,7 @@ jobs: - "/etc/ssh/ssh_known_hosts:/etc/ssh/ssh_known_hosts:ro" steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true diff --git a/.github/workflows/docker-build-env.yml b/.github/workflows/docker-build-env.yml index 7ab8876c78c..5cc6e90e0cb 100644 --- a/.github/workflows/docker-build-env.yml +++ b/.github/workflows/docker-build-env.yml @@ -38,7 +38,7 @@ jobs: steps: - name: Checkout Repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Prepare id: prep diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index 358edd4d4ad..fffcf4f62df 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -12,7 +12,7 @@ jobs: image: ghcr.io/cp2k/dbcsr-build-env-ubuntu-22.04:develop steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: submodules: true diff --git a/.github/workflows/testing-gcc.yml b/.github/workflows/testing-gcc.yml index a986213e5c2..8fec395cc1d 100644 --- a/.github/workflows/testing-gcc.yml +++ b/.github/workflows/testing-gcc.yml @@ -13,7 +13,7 @@ jobs: image: ghcr.io/cp2k/dbcsr-build-env-latest-gcc:develop steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true diff --git a/.github/workflows/testing-linux.yml b/.github/workflows/testing-linux.yml index 991d5252717..32a7d300365 100644 --- a/.github/workflows/testing-linux.yml +++ b/.github/workflows/testing-linux.yml @@ -15,7 +15,7 @@ jobs: container: image: ghcr.io/cp2k/dbcsr-build-env-ubuntu-22.04:develop steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Run pre-commit run: | git config --global --add safe.directory "$GITHUB_WORKSPACE" @@ -40,7 +40,7 @@ jobs: mpi_suffix: mpich steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true @@ -100,7 +100,7 @@ jobs: use_openmp: [OPENMP=ON] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true @@ -134,7 +134,7 @@ jobs: use_smm: [SMM=libxsmm] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true @@ -164,9 +164,10 @@ jobs: strategy: matrix: use_openmp: [OPENMP=ON] + use_g2g: [G2G=ON, G2G=OFF] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true @@ -176,10 +177,11 @@ jobs: mkdir -p build cd build cmake -G Ninja \ - -DCMAKE_BUILD_TYPE=Debug \ + -DCMAKE_BUILD_TYPE=Release \ -DUSE_${{ matrix.use_openmp }} \ -DUSE_ACCEL=hip \ - -DWITH_GPU=Mi100 \ + -DWITH_GPU=Mi250 \ + -DWITH_${{ matrix.use_g2g }} \ -DWITH_EXAMPLES=ON \ -DCMAKE_PREFIX_PATH=/opt/rocm \ .. @@ -194,7 +196,7 @@ jobs: image: ghcr.io/cp2k/dbcsr-build-env-ubuntu-22.04:develop steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - name: Download coverage data uses: actions/download-artifact@v3 diff --git a/.github/workflows/testing-macos.yml b/.github/workflows/testing-macos.yml index 42d50ad84b8..6c5228b521f 100644 --- a/.github/workflows/testing-macos.yml +++ b/.github/workflows/testing-macos.yml @@ -26,7 +26,7 @@ jobs: mpi_suffix: mpich steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: fetch-depth: 0 submodules: true diff --git a/.pre-commit/check_header.py b/.pre-commit/check_header.py index 75c6b40c7c8..3cf7d58233c 100755 --- a/.pre-commit/check_header.py +++ b/.pre-commit/check_header.py @@ -13,7 +13,9 @@ import re import mmap import sys -from os import path +import pathlib +from collections import defaultdict +from os import path, listdir from contextlib import contextmanager TYPES = { @@ -41,14 +43,19 @@ def mmap_open(name, mode="r"): def check_header(header_dir, files, verbose=False): retval = 0 - header_re = {} - header_len = {} - - for headertype in TYPES: - with open(path.join(header_dir, headertype), "rb") as fhandle: - header_content = fhandle.read() - header_re[headertype] = re.compile(re.escape(header_content)) - header_len[headertype] = len(header_content) + header_re = defaultdict(list) + header_len = defaultdict(list) + + for headerfile in listdir(header_dir): + headertype = pathlib.Path(headerfile).stem + if headertype in TYPES: + with open(path.join(header_dir, headerfile), "rb") as fhandle: + header_content = fhandle.read() + header_re[headertype].append(re.compile(re.escape(header_content))) + header_len[headertype].append(len(header_content)) + else: + print("no matching headerfile to file extensions") + sys.exit(1) ext_map = {e: t for t, exts in TYPES.items() for e in exts} @@ -62,9 +69,10 @@ def check_header(header_dir, files, verbose=False): with mmap_open(fpath) as fmapped: header_type = ext_map[fext] - match = header_re[header_type].search( - fmapped, 0, ALLOWED_LINES * MAX_LINE_LENGTH + header_len[header_type] - ) + for h_re, h_len in zip(header_re[header_type], header_len[header_type]): + match = h_re.search(fmapped, 0, ALLOWED_LINES * MAX_LINE_LENGTH + h_len) + if match: + break if not match: print("✗ {} ... required header not found".format(fpath)) diff --git a/.pre-commit/headers/c_cpp b/.pre-commit/headers/c_cpp.1 similarity index 100% rename from .pre-commit/headers/c_cpp rename to .pre-commit/headers/c_cpp.1 diff --git a/.pre-commit/headers/c_cpp.2 b/.pre-commit/headers/c_cpp.2 new file mode 100644 index 00000000000..24c5e9d07bc --- /dev/null +++ b/.pre-commit/headers/c_cpp.2 @@ -0,0 +1,9 @@ +/*------------------------------------------------------------------------------------------------*/ +/* Copyright (C) by the DBCSR developers group - All rights reserved */ +/* Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved */ +/* This file is part of the DBCSR library. */ +/* */ +/* For information on the license, see the LICENSE file. */ +/* For further information please visit https://dbcsr.cp2k.org */ +/* SPDX-License-Identifier: GPL-2.0+ */ +/*------------------------------------------------------------------------------------------------*/ diff --git a/.pre-commit/headers/fortran b/.pre-commit/headers/fortran.1 similarity index 100% rename from .pre-commit/headers/fortran rename to .pre-commit/headers/fortran.1 diff --git a/.pre-commit/headers/fortran.2 b/.pre-commit/headers/fortran.2 new file mode 100644 index 00000000000..9a02814edd4 --- /dev/null +++ b/.pre-commit/headers/fortran.2 @@ -0,0 +1,9 @@ +!--------------------------------------------------------------------------------------------------! +! Copyright (C) by the DBCSR developers group - All rights reserved ! +! Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved ! +! This file is part of the DBCSR library. ! +! ! +! For information on the license, see the LICENSE file. ! +! For further information please visit https://dbcsr.cp2k.org ! +! SPDX-License-Identifier: GPL-2.0+ ! +!--------------------------------------------------------------------------------------------------! diff --git a/.pre-commit/headers/fypp b/.pre-commit/headers/fypp.1 similarity index 100% rename from .pre-commit/headers/fypp rename to .pre-commit/headers/fypp.1 diff --git a/.pre-commit/headers/python b/.pre-commit/headers/python.1 similarity index 100% rename from .pre-commit/headers/python rename to .pre-commit/headers/python.1 diff --git a/AUTHORS b/AUTHORS index e1e3899a02c..bcf316119d5 100644 --- a/AUTHORS +++ b/AUTHORS @@ -4,6 +4,7 @@ Christian Pousa Dorothea Golze Fawzi Mohamed Florian Schiffmann +Gina Sitaraman Harald Forbert H. Bani-Hashemian Iain Bethune @@ -11,6 +12,7 @@ Ilia Sivkov Jan Wilhelm Joost VandeVondele Juerg Hutter +Leopold Grinberg Lianheng Tong Marcella Mauri-Iannuzzi Matthias Krack diff --git a/CMakeLists.txt b/CMakeLists.txt index 6fe704ff952..21f9b8ff9a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,7 @@ cmake_minimum_required(VERSION 3.22) +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE FORCE) + # include our cmake snippets set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_SOURCE_DIR}/cmake) @@ -111,7 +113,13 @@ set_property(CACHE WITH_GPU PROPERTY STRINGS ${SUPPORTED_CUDA_ARCHITECTURES} option(WITH_CUDA_PROFILING "Enable profiling within CUDA" OFF) option(WITH_HIP_PROFILING "Enable profiling within HIP" OFF) +option(WITH_G2G "Enable GPU aware MPI within CUDA/HIP backends" OFF) +if (WITH_G2G AND ((NOT USE_ACCEL) OR ((NOT USE_ACCEL MATCHES "cuda") + AND (NOT USE_ACCEL MATCHES "hip")))) + message( + FATAL_ERROR "GPU aware MPI can only be enabled for HIP/CUDA GPU backends") +endif () # ================================================================================================= # LANGUAGES AND TESTING enable_language(Fortran) @@ -236,6 +244,7 @@ if (USE_ACCEL MATCHES "cuda|hip") endif () if (USE_ACCEL MATCHES "cuda") + enable_language(CUDA) find_package(CUDAToolkit REQUIRED) if (CUDAToolkit_VERSION LESS 5.5) @@ -257,9 +266,15 @@ if (USE_ACCEL MATCHES "cuda") message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) message(STATUS "GPU profiling enabled: " ${WITH_CUDA_PROFILING}) + message(STATUS "GPU aware MPI enabled: " ${WITH_G2G}) endif () if (USE_ACCEL MATCHES "hip") + if (NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES OFF) + endif () + enable_language(HIP) + # Make sure the GPU required is supported list(FIND SUPPORTED_HIP_ARCHITECTURES ${WITH_GPU} GPU_SUPPORTED) if (GPU_SUPPORTED EQUAL -1) @@ -296,6 +311,7 @@ if (USE_ACCEL MATCHES "hip") message(STATUS "Kernel parameters: " ${WITH_GPU_PARAMS}) message(STATUS "GPU architecture number: " ${ACC_ARCH_NUMBER}) message(STATUS "GPU profiling enabled: " ${WITH_HIP_PROFILING}) + message(STATUS "GPU aware MPI enabled: " ${WITH_G2G}) # =================================== BLAS on GPU backend find_package(hipblas CONFIG REQUIRED HINTS ${ROCM_PATH}) @@ -339,3 +355,6 @@ endif () add_subdirectory(docs) include(CustomTargets) + +# Disable LTO +set(CMAKE_INTERPROCEDURAL_OPTIMIZATION FALSE FORCE) diff --git a/docs/guide/2-user-guide/1-installation/index.md b/docs/guide/2-user-guide/1-installation/index.md index fc06dbb2b86..33d71864284 100644 --- a/docs/guide/2-user-guide/1-installation/index.md +++ b/docs/guide/2-user-guide/1-installation/index.md @@ -69,6 +69,8 @@ make -DUSE_SMM= -DUSE_ACCEL= -DWITH_CUDA_PROFILING= +-DWITH_HIP_PROFILING= +-DWITH_G2G= -DWITH_C_API= -DWITH_EXAMPLES= -DWITH_GPU= diff --git a/docs/guide/3-developer-guide/3-programming/1-overview/index.md b/docs/guide/3-developer-guide/3-programming/1-overview/index.md index d55b9b3f30f..27f6bda40d0 100644 --- a/docs/guide/3-developer-guide/3-programming/1-overview/index.md +++ b/docs/guide/3-developer-guide/3-programming/1-overview/index.md @@ -55,3 +55,4 @@ Assumed square matrix with 20x20 matrix with 5x5 blocks and a 2x2 processor grid | `__CUDA_PROFILING` | To turn on Nvidia Tools Extensions. It requires to link `-lnvToolsExt` | Fortran, C, C++ | | `__CUDA` | Enable CUDA acceleration | C, C++ | | `__HIP` | Enable HIP acceleration | C, C++ | +| `__DBCSR_ACC_G2G` | Enable GPU Aware MPI in CUDA and HIP backends | Fortran, C, C++ | diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index b3d33e4bc5f..f2d2cc22817 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -109,6 +109,7 @@ set(DBCSR_HIP_AND_CUDA_SRCS acc/libsmm_acc/libsmm_acc_benchmark.cpp acc/libsmm_acc/libsmm_acc_init.cpp acc/libsmm_acc/libsmm_acc.cpp + acc/cuda_hip/calculate_norms.cpp acc/cuda_hip/acc_blas.cpp acc/cuda_hip/acc_dev.cpp acc/cuda_hip/acc_error.cpp @@ -122,6 +123,18 @@ set(DBCSR_CUDA_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/cuda/acc_cuda.cpp set(DBCSR_HIP_SRCS ${DBCSR_HIP_AND_CUDA_SRCS} acc/hip/acc_hip.cpp) +if (USE_ACCEL MATCHES "hip") + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES LANGUAGE HIP) + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES COMPILE_FLAGS "-fPIE") +elseif (USE_ACCEL MATCHES "cuda") + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES LANGUAGE CUDA) + set_source_files_properties(acc/cuda_hip/calculate_norms.cpp + PROPERTIES COMPILE_FLAGS "--x cu") +endif () + set(DBCSR_OPENCL_SRCS acc/opencl/smm/opencl_libsmm.c acc/opencl/acc_opencl.c acc/opencl/acc_opencl_event.c acc/opencl/acc_opencl_mem.c @@ -164,6 +177,13 @@ set_target_properties( SOVERSION ${dbcsr_APIVERSION} POSITION_INDEPENDENT_CODE ON) +if (USE_ACCEL MATCHES "hip") + set_target_properties(dbcsr PROPERTIES HIP_ARCHITECTURES "${ACC_ARCH_NUMBER}") +elseif (USE_ACCEL MATCHES "cuda") + set_target_properties(dbcsr PROPERTIES CUDA_ARCHITECTURES + "${ACC_ARCH_NUMBER}") +endif () + if (USE_SMM MATCHES "libxsmm") target_compile_definitions(dbcsr PRIVATE __LIBXSMM) target_link_directories(dbcsr PUBLIC ${LIBXSMM_LIBRARY_DIRS}) @@ -262,6 +282,18 @@ if (USE_ACCEL) $<$:roctx64> $<$:roctracer64> $<$:OpenCL::OpenCL>) + + if (WITH_G2G) + target_compile_definitions( + dbcsr + PRIVATE __DBCSR_ACC_G2G + $<$:__CUDA> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__HIP> + $<$:ARCH_NUMBER=${ACC_ARCH_NUMBER}> + $<$:__CUDA_PROFILING> + $<$:__HIP_PROFILING>) + endif () endif () # ================================================================================================= diff --git a/src/acc/cuda/Makefile b/src/acc/cuda/Makefile index 97665112cf6..79092692035 100644 --- a/src/acc/cuda/Makefile +++ b/src/acc/cuda/Makefile @@ -302,7 +302,7 @@ $(ACCDIR)/dbcsr_acc_smm.a: $(OBJSMM) %.o: %.cpp $(INCALL) $(MAKDIR)/Makefile $(CXX) $(DFLAGS) $(CXXFLAGS) $(CFLAGS_XSMM) -c $< -o $@ -$(DIRSMM)/calculate_norms.o: $(DIRSMM)/calculate_norms.cpp $(INCALL) $(MAKDIR)/Makefile +$(ACCDIR)/cuda_hip/calculate_norms.o: $(ACCDIR)/cuda_hip/calculate_norms.cpp $(INCALL) $(MAKDIR)/Makefile $(NVCC) $(DFLAGS) -x cu -allow-unsupported-compiler \ --compiler-options="$(filter-out -pedantic,$(CXXFLAGS)) $(CFLAGS_XSMM)" -c $< -o $@ diff --git a/src/acc/cuda_hip/acc_dev.cpp b/src/acc/cuda_hip/acc_dev.cpp index 154a59bf529..9028a4a3c5f 100644 --- a/src/acc/cuda_hip/acc_dev.cpp +++ b/src/acc/cuda_hip/acc_dev.cpp @@ -1,5 +1,6 @@ /*------------------------------------------------------------------------------------------------*/ /* Copyright (C) by the DBCSR developers group - All rights reserved */ +/* Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved */ /* This file is part of the DBCSR library. */ /* */ /* For information on the license, see the LICENSE file. */ @@ -20,7 +21,9 @@ #include // for debug purpose +#if defined(__HIP_PLATFORM_NVCC__) static const int verbose_print = 1; +#endif /****************************************************************************/ extern "C" int c_dbcsr_acc_get_ndevices(int* n_devices) { diff --git a/src/acc/cuda_hip/calculate_norms.cpp b/src/acc/cuda_hip/calculate_norms.cpp new file mode 100644 index 00000000000..e267a91d936 --- /dev/null +++ b/src/acc/cuda_hip/calculate_norms.cpp @@ -0,0 +1,117 @@ +/*------------------------------------------------------------------------------------------------*/ +/* Copyright (C) by the DBCSR developers group - All rights reserved */ +/* Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved */ +/* This file is part of the DBCSR library. */ +/* */ +/* For information on the license, see the LICENSE file. */ +/* For further information please visit https://dbcsr.cp2k.org */ +/* SPDX-License-Identifier: GPL-2.0+ */ +/*------------------------------------------------------------------------------------------------*/ + +/***************************************************************************** + * Authors: Gina Sitaraman * + *****************************************************************************/ + +/* + * Execution configuration: + * gridDim.x = number of matrix blocks in this batched norms calculation + * = length of the batched norms calculation stack + * blockIdx.x = {0, ..., gridDim.x-1} + * blockDim.x = warp size (for now, assuming warp size is going to be 64 or 32) + * threadIdx.x = {0, ..., blockDim.x-1} + + * Execute batched norms calculation + + * Function arguments + * --- norms: (pointer to global memory): + * output array of norms, one per matrix in the stack + * --- offsets: (pointer to global memory): + * array of offsets, indicating where each block starts in the "mat" buffer + * --- nelems: (pointer to global memory): + * array of integers, indicating the number of elements in each matrix/block + * --- mat (pointer to global memory): + * arrays containing the matrices for which norms are calculated + + * Algorithm specificities: + * --- warp level primitives are used to reduce within a warp/wavefront, and + * shared memory is used if more than one warp/wavefront is detected + */ + +#if defined(__CUDA) +# include "../cuda/acc_cuda.h" +#elif defined(__HIP) +# include "../hip/acc_hip.h" +#endif +#include "libsmm_acc_init.h" + +template +__global__ void calculate_norms_d( + float* __restrict__ norms, const int* __restrict__ offsets, const int* __restrict__ nelems, const double* __restrict__ mat) { + __shared__ double buf[(blocksz + warpsz - 1) / warpsz]; + double d, sum = 0.0; + + /* Get the offset in the stack that this thread block should handle */ + int blkoffset = offsets[blockIdx.x]; + + /* Get the number of elements in this matrix */ + int nelem = nelems[blockIdx.x]; + + /* Loop over nelem matrix elements for this block */ + for (int i = threadIdx.x; i < nelem; i += blockDim.x) { + /* Load matrix elements, reduce in register */ + d = mat[blkoffset + i]; + sum += d * d; + } + __syncthreads(); + + /* reduce in warp to one value using warp level primitives */ +#if defined(__CUDA) + unsigned mask = 0xffffffff; + for (int offset = warpsz / 2; offset > 0; offset /= 2) { + sum += __shfl_down_sync(mask, sum, offset); + } +#elif defined(__HIP) + for (int offset = warpsz / 2; offset > 0; offset /= 2) { + sum += __shfl_down(sum, offset); + } +#endif + + /* reduce between warps if needed */ + if (blocksz > warpsz) { + if (threadIdx.x % warpsz == 0) { + int warpid = threadIdx.x / warpsz; + buf[warpid] = sum; + } + __syncthreads(); + if (threadIdx.x == 0) { + for (int i = 1; i < blocksz / warpsz; ++i) { + sum += buf[i]; + } + } + } + if (threadIdx.x == 0) { + /* write out this stack's dot product */ + norms[blockIdx.x] = sum; + } +} + +extern "C" int c_calculate_norms(double* mat, int nblks, int* offsets, int* nelems, float* norms, void* stream_ptr) { + int warp_size = acc_get_gpu_warp_size(); + + dim3 grid(nblks); + dim3 block(warp_size); + + ACC_DRV(stream) stream = *((ACC_DRV(stream)*)stream_ptr); + /* block size may be a multiple of warp_size as well */ + if (warp_size == 64) { + calculate_norms_d<64, 64><<>>(norms, offsets, nelems, mat); + } + else if (warp_size == 32) { + calculate_norms_d<32, 32><<>>(norms, offsets, nelems, mat); + } + else { + fprintf(stderr, "Found warp size other than 64 or 32, aborting..\n"); + return -1; + } + return 0; +} diff --git a/src/acc/libsmm_acc/libsmm_acc.cpp b/src/acc/libsmm_acc/libsmm_acc.cpp index ade641a67d3..c7c6b044c4d 100644 --- a/src/acc/libsmm_acc/libsmm_acc.cpp +++ b/src/acc/libsmm_acc/libsmm_acc.cpp @@ -1,5 +1,6 @@ /*------------------------------------------------------------------------------------------------*/ /* Copyright (C) by the DBCSR developers group - All rights reserved */ +/* Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved */ /* This file is part of the DBCSR library. */ /* */ /* For information on the license, see the LICENSE file. */ @@ -140,8 +141,8 @@ inline void jit_kernel(ACC_DRV(function) & kern_func, libsmm_acc_algo algo, int const char* compileOptions[] = {"-D__CUDA", "-w", ARCH_OPTION}; size_t nOptions = 3; #else - const char* compileOptions[] = {"-D__HIP", "-O3", "-w"}; - size_t nOptions = 3; + const char* compileOptions[] = {"-D__HIP", "-O3", "-w", "-munsafe-fp-atomics"}; + size_t nOptions = 4; #endif ACC_RTC(Result) compileResult = ACC_RTC(CompileProgram)(kernel_program, nOptions, compileOptions); if (compileResult != ACC_RTC_SUCCESS) { diff --git a/src/acc/libsmm_acc/tune/tune_setup.py b/src/acc/libsmm_acc/tune/tune_setup.py index b2b2c042e6a..deb8a10011e 100755 --- a/src/acc/libsmm_acc/tune/tune_setup.py +++ b/src/acc/libsmm_acc/tune/tune_setup.py @@ -364,7 +364,9 @@ def gen_makefile(outdir, compiler, arch): + " -w -c -o $@ -std=c++11 $<\n\n" ) else: - output += "\thipcc -O3 -D__TUNING -D__HIP -w -c -o $@ $<\n\n" + output += ( + "\thipcc -O3 -D__TUNING -D__HIP -w -munsafe-fp-atomics -c -o $@ $<\n\n" + ) # compilation rule for kernel files headers = " ".join([f"../{fn}" for fn in Path("../kernels").glob("*.h")]) @@ -372,7 +374,7 @@ def gen_makefile(outdir, compiler, arch): if compiler == "nvcc": output += f" nvcc -O3 -D__TUNING -D__CUDA -arch={str(arch)} -w -c $<\n\n" else: - output += "\thipcc -O3 -D__TUNING -D__HIP -w -c $<\n\n" + output += "\thipcc -O3 -D__TUNING -D__HIP -w -munsafe-fp-atomics -c $<\n\n" # compilation rule for autotuning executables for exe_src in all_exe_src: @@ -396,7 +398,7 @@ def gen_makefile(outdir, compiler, arch): ) else: rocm_path = os.getenv("ROCM_PATH", "/opt/rocm") - output += f"\thipcc -O3 -D__HIP -w -o $@ $^ {rocm_path}/hip/lib/libamdhip64.so\n\n" + output += f"\thipcc -O3 -D__HIP -w -munsafe-fp-atomics -o $@ $^ {rocm_path}/hip/lib/libamdhip64.so\n\n" # write Makefile writefile(outdir / "Makefile", output) diff --git a/src/mm/dbcsr_acc_operations.F b/src/mm/dbcsr_acc_operations.F index 0ea75183df0..c57fa51474d 100644 --- a/src/mm/dbcsr_acc_operations.F +++ b/src/mm/dbcsr_acc_operations.F @@ -67,6 +67,7 @@ FUNCTION libsmm_acc_transpose_cu(trs_stack, offset, nblks, buffer, & TYPE(C_PTR), VALUE :: stream_ptr INTEGER(KIND=C_INT) :: istat END FUNCTION libsmm_acc_transpose_cu + END INTERFACE #endif diff --git a/src/mm/dbcsr_mm.F b/src/mm/dbcsr_mm.F index 3cb711974b4..8cb2485847c 100644 --- a/src/mm/dbcsr_mm.F +++ b/src/mm/dbcsr_mm.F @@ -1,5 +1,6 @@ !--------------------------------------------------------------------------------------------------! ! Copyright (C) by the DBCSR developers group - All rights reserved ! +! Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved ! ! This file is part of the DBCSR library. ! ! ! ! For information on the license, see the LICENSE file. ! @@ -51,7 +52,7 @@ MODULE dbcsr_mm dbcsr_col_block_offsets, dbcsr_col_block_sizes, dbcsr_destroy_array, dbcsr_distribution, & dbcsr_get_matrix_type, dbcsr_has_symmetry, dbcsr_image_dist_release, dbcsr_nblkcols_total, & dbcsr_nfullcols_total, dbcsr_nfullrows_total, dbcsr_release, dbcsr_release_locals, & - dbcsr_row_block_offsets + dbcsr_row_block_offsets, dbcsr_get_data_type USE dbcsr_mm_3D, ONLY: buffers_release, & dbcsr_make_buffers, & get_max_layers_3D, & @@ -60,10 +61,12 @@ MODULE dbcsr_mm release_layers_3D_C_reduction, & request_sync_mult USE dbcsr_mm_cannon, ONLY: make_m2s, & - multiply_cannon + multiply_cannon, & + multiply_cannon_g2g USE dbcsr_mm_common, ONLY: & dbcsr_mpi_statistics, max_memory, memtype_abpanel_1, memtype_abpanel_2, & memtype_mpi_buffer, memtype_mpi_product, memtype_product_wm, memtype_trsbuffer_1, & + memtype_normsbuf, memtype_offsetsbuf, memtype_nelemsbuf, & memtype_trsbuffer_2, num_multiplications, stream_1, stream_2 USE dbcsr_mm_dist_operations, ONLY: dbcsr_create_image_dist, & dbcsr_make_dists_dense, & @@ -96,7 +99,8 @@ MODULE dbcsr_mm USE dbcsr_types, ONLY: & dbcsr_2d_array_type, dbcsr_conjugate_transpose, dbcsr_distribution_obj, & dbcsr_imagedistribution_obj, dbcsr_mp_obj, dbcsr_mpi_size_limits, dbcsr_no_transpose, & - dbcsr_scalar_type, dbcsr_transpose, dbcsr_type, dbcsr_type_antisymmetric + dbcsr_scalar_type, dbcsr_transpose, dbcsr_type, dbcsr_type_antisymmetric, & + dbcsr_type_real_8 USE dbcsr_work_operations, ONLY: dbcsr_add_wm_from_matrix, & dbcsr_finalize, & dbcsr_work_create @@ -185,6 +189,12 @@ SUBROUTINE dbcsr_multiply_lib_finalize() CALL dbcsr_mempool_destruct(memtype_trsbuffer_1%pool) IF (ASSOCIATED(memtype_trsbuffer_2%pool)) & CALL dbcsr_mempool_destruct(memtype_trsbuffer_2%pool) + IF (ASSOCIATED(memtype_normsbuf%pool)) & + CALL dbcsr_mempool_destruct(memtype_normsbuf%pool) + IF (ASSOCIATED(memtype_offsetsbuf%pool)) & + CALL dbcsr_mempool_destruct(memtype_offsetsbuf%pool) + IF (ASSOCIATED(memtype_nelemsbuf%pool)) & + CALL dbcsr_mempool_destruct(memtype_nelemsbuf%pool) IF (ASSOCIATED(memtype_abpanel_1%pool)) & CALL dbcsr_mempool_destruct(memtype_abpanel_1%pool) IF (ASSOCIATED(memtype_abpanel_2%pool)) & @@ -310,6 +320,12 @@ SUBROUTINE dbcsr_multiply_clear_mempools() CALL dbcsr_mempool_clear(memtype_trsbuffer_1%pool) IF (ASSOCIATED(memtype_trsbuffer_2%pool)) & CALL dbcsr_mempool_clear(memtype_trsbuffer_2%pool) + IF (ASSOCIATED(memtype_normsbuf%pool)) & + CALL dbcsr_mempool_clear(memtype_normsbuf%pool) + IF (ASSOCIATED(memtype_offsetsbuf%pool)) & + CALL dbcsr_mempool_clear(memtype_offsetsbuf%pool) + IF (ASSOCIATED(memtype_nelemsbuf%pool)) & + CALL dbcsr_mempool_clear(memtype_nelemsbuf%pool) IF (ASSOCIATED(memtype_abpanel_1%pool)) & CALL dbcsr_mempool_clear(memtype_abpanel_1%pool) IF (ASSOCIATED(memtype_abpanel_2%pool)) & @@ -388,7 +404,7 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & CHARACTER :: transa_l, transb_l INTEGER :: f_col, f_k, f_row, handle, handle2, ithread, l_col, l_k, l_row, & nimages_left_rows, nimages_match, nimages_right_cols, npcols, nprows, numnodes, & - output_unit + data_type, output_unit INTEGER(KIND=int_8) :: my_flop LOGICAL :: ab_dense, keep_product_data, keep_sparsity, product_reindex, release_tdist, & transpose_left, transpose_right, use_dense_mult, use_mempools, thread_dist_force @@ -455,8 +471,17 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_memtype_setup(memtype_trsbuffer_2, has_pool=.TRUE., & acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_2) + CALL dbcsr_memtype_setup(memtype_normsbuf, has_pool=.TRUE., & + acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) + CALL dbcsr_memtype_setup(memtype_offsetsbuf, has_pool=.TRUE., & + acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) + CALL dbcsr_memtype_setup(memtype_nelemsbuf, has_pool=.TRUE., & + acc_hostalloc=.TRUE., acc_devalloc=.TRUE., acc_stream=stream_1) CALL dbcsr_mempool_limit_capacity(memtype_trsbuffer_1%pool, capacity=1) CALL dbcsr_mempool_limit_capacity(memtype_trsbuffer_2%pool, capacity=1) + CALL dbcsr_mempool_limit_capacity(memtype_normsbuf%pool, capacity=1) + CALL dbcsr_mempool_limit_capacity(memtype_offsetsbuf%pool, capacity=1) + CALL dbcsr_mempool_limit_capacity(memtype_nelemsbuf%pool, capacity=1) END IF CALL dbcsr_memtype_setup(memtype_mpi_buffer, mpi=.TRUE.) @@ -880,10 +905,28 @@ SUBROUTINE dbcsr_multiply_generic(transa, transb, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) ELSE + data_type = dbcsr_get_data_type(product_matrix) +#if defined (__DBCSR_ACC_G2G) + IF (data_type .NE. dbcsr_type_real_8) THEN + ! If G2G is enabled, norms have to be calculated on the GPU. + ! Since the norms kernel expects only real_8 type data, we + ! avoid using G2G for all other data types + CALL multiply_cannon(m2s_left, m2s_right, product_matrix, & + retain_sparsity=retain_sparsity, & + filter_eps=filter_eps, & + flop=my_flop, keep_product_data=keep_product_data) + ELSE + CALL multiply_cannon_g2g(m2s_left, m2s_right, product_matrix, & + retain_sparsity=retain_sparsity, & + filter_eps=filter_eps, & + flop=my_flop, keep_product_data=keep_product_data) + END IF +#else CALL multiply_cannon(m2s_left, m2s_right, product_matrix, & retain_sparsity=retain_sparsity, & filter_eps=filter_eps, & flop=my_flop, keep_product_data=keep_product_data) +#endif CALL dbcsr_finalize(product_matrix, reshuffle=PRESENT(filter_eps) .AND. .NOT. keep_sparsity) END IF ! diff --git a/src/mm/dbcsr_mm_cannon.F b/src/mm/dbcsr_mm_cannon.F index e697bd780af..fffd7f5d109 100644 --- a/src/mm/dbcsr_mm_cannon.F +++ b/src/mm/dbcsr_mm_cannon.F @@ -1,5 +1,6 @@ !--------------------------------------------------------------------------------------------------! ! Copyright (C) by the DBCSR developers group - All rights reserved ! +! Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved ! ! This file is part of the DBCSR library. ! ! ! ! For information on the license, see the LICENSE file. ! @@ -18,6 +19,8 @@ MODULE dbcsr_mm_cannon USE dbcsr_acc_event, ONLY: acc_event_synchronize USE dbcsr_acc_device, ONLY: acc_device_synchronize + USE dbcsr_acc_stream, ONLY: acc_stream_synchronize + USE dbcsr_acc_devmem, ONLY: acc_devmem_cptr USE dbcsr_array_types, ONLY: array_data, & array_exists, & array_i1d_obj, & @@ -64,6 +67,7 @@ MODULE dbcsr_mm_cannon acc_transpose_blocks, calculate_norms, count_mpi_statistics, dbcsr_mm_multrec_type_p, & dbcsr_mpi_statistics, enumerate_blk_sizes, huge_norm, local_filter, max_memory, & memtype_abpanel_1, memtype_abpanel_2, memtype_mpi_buffer, memtype_trsbuffer_1, & + memtype_normsbuf, memtype_offsetsbuf, memtype_nelemsbuf, acc_calculate_norms, & memtype_trsbuffer_2, product_matrix_size_guess, rec_sort_index, setup_buffer_matrix USE dbcsr_mm_dist_operations, ONLY: dbcsr_get_local_vcols, & dbcsr_get_local_vrows, & @@ -105,12 +109,18 @@ MODULE dbcsr_mm_cannon m_memory USE dbcsr_mpiwrap, ONLY: mp_allgather, & mp_alltoall, & + mp_environ, & mp_irecv, & mp_isend, & mp_request_null, & mp_sum, & mp_testany, & - mp_waitall, mp_comm_type, mp_request_type + mp_waitall, & + mp_comm_type, & + mp_request_type + + USE ISO_C_BINDING, ONLY: C_F_POINTER + #include "base/dbcsr_base_uses.f90" !$ USE OMP_LIB, ONLY: omp_get_max_threads, omp_get_thread_num, omp_get_num_threads @@ -128,7 +138,8 @@ MODULE dbcsr_mm_cannon MODULE PROCEDURE dbcsr_switch_d_ptrs END INTERFACE - PUBLIC :: multiply_cannon, make_m2s + PUBLIC :: multiply_cannon, make_m2s, & + multiply_cannon_g2g CONTAINS @@ -1757,6 +1768,1071 @@ SUBROUTINE multiply_cannon(left_set, right_set, product_matrix, & CALL timestop(handle) END SUBROUTINE multiply_cannon + SUBROUTINE multiply_cannon_g2g(left_set, right_set, product_matrix, & + retain_sparsity, & + filter_eps, flop, keep_product_data) + !! Multiplies two DBCSR matrices + !! + !! This function is expected to be called only if __DBCSR_ACC_G2G + !! is enabled and the data type is FP64. + !! + !! If __DBCSR_ACC is enabled, norms are calculated on the GPU and + !! MPI calls reference buffers on the GPU device. Input matrices + !! are copied from host to device only once. For the right matrix, + !! transpose kernel is also called only once and the transposed + !! matrix is transferred over MPI to neighbors. + !! + !! If __DBCSR_ACC is not enabled, all calculations are performed on + !! the CPU and MPI calls reference host buffers. + + TYPE(dbcsr_2d_array_type), POINTER :: left_set, right_set + !! set of imaged left matrices + !! set of imaged right matrices + TYPE(dbcsr_type), INTENT(INOUT) :: product_matrix + !! DBCSR product matrix + LOGICAL, INTENT(IN), OPTIONAL :: retain_sparsity + !! retain the sparsity of the existing product matrix; default is no + REAL(kind=real_8), INTENT(in), OPTIONAL :: filter_eps + INTEGER(KIND=int_8), INTENT(OUT) :: flop + !! effective flop + LOGICAL, INTENT(IN) :: keep_product_data + + CHARACTER(len=*), PARAMETER :: routineN = 'multiply_cannon' + INTEGER, PARAMETER :: idata = 1, ileft = 0, imeta = 2, & + iright = 2 + + INTEGER :: data_type, data_type_byte, handle, handle1, handle2, handle3, i, ithread, & + left_col_image, left_col_mult, left_col_nimages, left_dst_icol, left_dst_irow, & + left_dst_p, left_dst_pcol, left_dst_prow, left_dst_vcol, left_dst_vrow, left_max_nblks, & + left_max_nze, left_myfirstvcol, left_myfirstvrow, left_mypcol, left_myprow, left_npcols, & + left_nprows, left_recv_icol, left_recv_irow, left_recv_p, left_recv_pcol, left_recv_prow, & + left_recv_vcol, left_recv_vrow, left_row_image, left_row_mult, left_row_nimages, & + left_send_icol, left_send_irow, left_send_p, left_send_pcol, left_send_prow + INTEGER :: left_send_vcol, left_send_vrow, left_src_icol, left_src_irow, left_src_p, & + left_src_pcol, left_src_prow, left_src_vcol, left_src_vrow, metronome, min_nimages, & + mynode, nblkrows_used, nsteps_k, nthreads, numnodes, nvirt_k, & + output_unit, right_col_image, right_col_mult, right_col_nimages, right_dst_icol, & + right_dst_irow, right_dst_p, right_dst_pcol, right_dst_prow, right_dst_vcol, & + right_dst_vrow, right_max_nblks, right_max_nze, right_myfirstvcol, right_myfirstvrow, & + right_mypcol, right_myprow, right_npcols, right_nprows, right_recv_icol, right_recv_irow + INTEGER :: right_recv_p, right_recv_pcol, right_recv_prow, right_recv_vcol, right_recv_vrow, & + right_row_image, right_row_mult, right_row_nimages, right_send_icol, right_send_irow, & + right_send_p, right_send_pcol, right_send_prow, right_send_vcol, right_send_vrow, & + right_src_icol, right_src_irow, right_src_p, right_src_pcol, right_src_prow, & + right_src_vcol, right_src_vrow, row, size_guess, size_guess_init, stat, threads_finished, & + threads_finished_read, v_ki, v_ki_left, v_ki_right, max_nblks + INTEGER :: left_numnodes, right_numnodes, left_mynode, right_mynode + INTEGER :: msglen + INTEGER(KIND=int_8) :: flop_single, flop_total, mem + INTEGER, ALLOCATABLE, DIMENSION(:) :: row_counts, total_row_counts + INTEGER, ALLOCATABLE, DIMENSION(:, :, :) :: left_sizes, my_sizes, right_sizes + INTEGER, ALLOCATABLE, DIMENSION(:, :, :, :) :: all_sizes + INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: col_blk_sizes2enum, enum2col_blk_sizes, & + enum2row_blk_sizes, m_sizes, n_sizes, & + row_blk_sizes2enum, left_index_rp, left_index_sp, & + right_index_rp, right_index_sp + INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: k_sizes + INTEGER, DIMENSION(:, :), POINTER, CONTIGUOUS :: left_pgrid, product_pgrid, right_pgrid + INTEGER, SAVE :: mult_id = 0 + LOGICAL :: keep_sparsity, list_indexing, & + otf_filtering + LOGICAL :: copy_left, copy_right + + REAL(kind=sp), ALLOCATABLE, DIMENSION(:) :: left_norms, right_norms, & + row_max_epss + REAL(kind=sp) :: filter_eps_sp + TYPE(dbcsr_2d_array_type), POINTER :: left_buffer_2, left_buffer_calc, & + left_buffer_comm, right_buffer_2, right_buffer_calc, right_buffer_comm + TYPE(dbcsr_data_obj) :: left_data_rp, left_data_sp, & + right_data_rp, right_data_sp + TYPE(dbcsr_data_obj), POINTER :: trs_stackbuf_calc, & + trs_stackbuf_comm + TYPE(dbcsr_data_obj), TARGET :: trs_stackbuf_1, trs_stackbuf_2 + TYPE(dbcsr_data_obj) :: normsbuf, offsetsbuf, nelemsbuf + TYPE(dbcsr_mm_multrec_type_p), DIMENSION(:), ALLOCATABLE :: multrec + TYPE(dbcsr_mp_obj) :: left_mp_obj, product_mp_obj, & + right_mp_obj + TYPE(mp_comm_type) :: grp, left_grp, right_grp, mp_group + TYPE(mp_request_type), DIMENSION(:), ALLOCATABLE :: left_data_rr, left_data_sr, left_index_rr, & + left_index_sr, right_data_rr, right_data_sr, right_index_rr, right_index_sr + +! --------------------------------------------------------------------------- + + CALL timeset(routineN, handle) + NULLIFY (trs_stackbuf_calc, trs_stackbuf_comm) + NULLIFY (row_blk_sizes2enum, enum2row_blk_sizes) + NULLIFY (col_blk_sizes2enum, enum2col_blk_sizes) + NULLIFY (k_sizes) + ! + ALLOCATE (left_buffer_2, right_buffer_2) + mult_id = mult_id + 1 + + IF (PRESENT(retain_sparsity)) THEN + keep_sparsity = retain_sparsity + ELSE + keep_sparsity = .FALSE. + END IF + otf_filtering = PRESENT(filter_eps) + +!$OMP PARALLEL DEFAULT (NONE) & +!$OMP SHARED (multrec, nthreads, product_matrix) +!$OMP MASTER + nthreads = 1 +!$ nthreads = OMP_GET_NUM_THREADS() + IF (.NOT. ASSOCIATED(product_matrix%wms)) & + DBCSR_ABORT("Work matrices do not exist") + IF (SIZE(product_matrix%wms) .NE. nthreads) & + DBCSR_ABORT("Work matrices not correctly sized.") + ALLOCATE (multrec(0:nthreads - 1)) +!$OMP END MASTER +!$OMP END PARALLEL + + output_unit = default_output_unit + flop_total = 0 + ! Set up variables + data_type = dbcsr_get_data_type(product_matrix) + data_type_byte = dbcsr_datatype_sizeof(data_type) + left_row_nimages = left_set%image_dist%i%row_decimation + left_row_mult = left_set%image_dist%i%row_multiplicity + left_col_nimages = left_set%image_dist%i%col_decimation + left_col_mult = left_set%image_dist%i%col_multiplicity + right_row_nimages = right_set%image_dist%i%row_decimation + right_row_mult = right_set%image_dist%i%row_multiplicity + right_col_nimages = right_set%image_dist%i%col_decimation + right_col_mult = right_set%image_dist%i%col_multiplicity + left_mp_obj = dbcsr_distribution_mp(left_set%image_dist%i%main) + right_mp_obj = dbcsr_distribution_mp(right_set%image_dist%i%main) + product_mp_obj = dbcsr_distribution_mp(product_matrix%dist) + numnodes = dbcsr_mp_numnodes(product_mp_obj) + mynode = dbcsr_mp_mynode(product_mp_obj) + left_nprows = dbcsr_mp_nprows(left_mp_obj) + left_npcols = dbcsr_mp_npcols(left_mp_obj) + left_myprow = dbcsr_mp_myprow(left_mp_obj) + left_mypcol = dbcsr_mp_mypcol(left_mp_obj) + left_myfirstvrow = dbcsr_mp_myprow(left_mp_obj)*left_row_nimages + left_myfirstvcol = dbcsr_mp_mypcol(left_mp_obj)*left_col_nimages + right_nprows = dbcsr_mp_nprows(right_mp_obj) + right_npcols = dbcsr_mp_npcols(right_mp_obj) + right_myprow = dbcsr_mp_myprow(right_mp_obj) + right_mypcol = dbcsr_mp_mypcol(right_mp_obj) + right_myfirstvrow = dbcsr_mp_myprow(right_mp_obj)*right_row_nimages + right_myfirstvcol = dbcsr_mp_mypcol(right_mp_obj)*right_col_nimages + mp_group = dbcsr_mp_group(product_mp_obj) + left_pgrid => dbcsr_mp_pgrid(left_mp_obj) + right_pgrid => dbcsr_mp_pgrid(right_mp_obj) + product_pgrid => dbcsr_mp_pgrid(product_mp_obj) + CALL dbcsr_mp_grid_setup(product_mp_obj) + CALL dbcsr_mp_grid_setup(left_mp_obj) + CALL dbcsr_mp_grid_setup(right_mp_obj) + ! + ! Dummy checks + ! left/right matching + IF (left_col_nimages .NE. right_row_mult) & + DBCSR_ABORT("Left/Right image mismatch") + IF (left_col_mult .NE. right_row_nimages) & + DBCSR_ABORT("Left/Right image mismatch") + IF (left_col_nimages*left_npcols .NE. right_row_nimages*right_nprows) & + DBCSR_ABORT("Left/Right total mismatch") + ! product/left matching + IF (left_row_mult*dbcsr_mp_nprows(product_mp_obj) .NE. left_row_nimages*left_nprows) & + DBCSR_ABORT("Product/Left total mismatch") + ! product/left matching + IF (right_col_mult*dbcsr_mp_npcols(product_mp_obj) .NE. right_col_nimages*right_npcols) & + DBCSR_ABORT("Product/Right total mismatch") + ! Limitations + IF (left_row_nimages .NE. 1) & + DBCSR_ABORT("Product/Left matrix process grid mismatch") + IF (left_row_mult .NE. 1) & + DBCSR_ABORT("Product/Left matrix process grid mismatch") + IF (right_col_nimages .NE. 1) & + DBCSR_ABORT("Product/Right matrix process grid mismatch") + IF (right_col_mult .NE. 1) & + DBCSR_ABORT("Product/Right matrix process grid mismatch") + + dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, left_row_nimages*left_col_nimages) + dbcsr_mpi_statistics%nimages = MAX(dbcsr_mpi_statistics%nimages, right_row_nimages*right_col_nimages) + ! + ! Exchange size data + ALLOCATE (my_sizes(4, MAX(left_row_nimages, right_row_nimages), & + MAX(left_col_nimages, right_col_nimages))) + my_sizes(:, :, :) = 0 + DO left_row_image = 1, left_row_nimages + DO left_col_image = 1, left_col_nimages + my_sizes(idata + ileft, left_row_image, left_col_image) & + = dbcsr_data_get_size_referenced( & + left_set%mats(left_row_image, left_col_image)%data_area) + my_sizes(imeta + ileft, left_row_image, left_col_image) = & + left_set%mats(left_row_image, left_col_image)%index & + (dbcsr_slot_size) + END DO + END DO + + DO right_row_image = 1, right_row_nimages + DO right_col_image = 1, right_col_nimages + my_sizes(idata + iright, right_row_image, right_col_image) & + = dbcsr_data_get_size_referenced( & + right_set%mats(right_row_image, right_col_image)%data_area) + my_sizes(imeta + iright, right_row_image, right_col_image) = & + right_set%mats(right_row_image, right_col_image)%index & + (dbcsr_slot_size) + END DO + END DO + + ALLOCATE (all_sizes(4, LBOUND(my_sizes, 2):UBOUND(my_sizes, 2), & + LBOUND(my_sizes, 3):UBOUND(my_sizes, 3), 0:numnodes - 1)) + CALL mp_allgather(my_sizes, all_sizes, mp_group) + ! + ! Count the maximum possible multiplies per row for on-the-fly + ! filtering. + per_row_eps: IF (.NOT. otf_filtering) THEN + ! These arrays must be valid when passed to called subroutines. + ALLOCATE (left_norms(0), right_norms(0), row_max_epss(0), stat=stat) + IF (stat .NE. 0) & + DBCSR_ABORT("Could not allocate memory") + ELSE + IF (careful_mod) THEN + IF (left_set%mats(1, 1)%bcsc) & + DBCSR_ABORT("Can not do on-the-fly filtering with CSC-indexed matrices.") + END IF + IF (dbcsr_has_local_row_index(left_set%mats(1, 1))) THEN + nblkrows_used = dbcsr_nblkrows_local(left_set%mats(1, 1)) + ELSE + nblkrows_used = dbcsr_nblkrows_total(left_set%mats(1, 1)) + END IF + ALLOCATE (row_max_epss(nblkrows_used), stat=stat) + IF (stat .NE. 0) & + DBCSR_ABORT("Could not allocate memory for left epsilons") + ALLOCATE (row_counts(nblkrows_used), stat=stat) + IF (stat .NE. 0) & + DBCSR_ABORT("Could not allocate memory for left row counts") + ! The summation could be done prow-locally but it would + ! complicate the pre-row eps calculation. + ALLOCATE (total_row_counts(nblkrows_used), stat=stat) + IF (stat .NE. 0) & + DBCSR_ABORT("Could not allocate memory for left row counts") + ! Each prow member matrix (npcols * row_images) counts the + ! blocks present in each of its rows. + total_row_counts(:) = 0 + DO left_row_image = 1, left_row_nimages + DO left_col_image = 1, left_col_nimages + list_indexing = & + left_set%mats(left_row_image, left_col_image)%list_indexing + IF (careful_mod) THEN + IF (list_indexing) THEN + IF ((left_set%mats(left_row_image, left_col_image)%nblks)*3 .NE. & + SIZE(left_set%mats(left_row_image, left_col_image)%coo_l)) & + DBCSR_ABORT("Row count mismatch") + ELSE + IF (nblkrows_used + 1 .NE. SIZE(left_set%mats(left_row_image, left_col_image)%row_p)) & + DBCSR_ABORT("Row count mismatch") + END IF + END IF + IF (list_indexing) THEN + CALL count_bins( & + left_set%mats(left_row_image, left_col_image)%nblks, & + left_set%mats(left_row_image, left_col_image)%coo_l(1::3), & + nblkrows_used, row_counts) + ELSE + CALL dbcsr_count_row_index( & + left_set%mats(left_row_image, left_col_image)%row_p, & + row_counts, nblkrows_used) + END IF + total_row_counts(:) = total_row_counts(:) & + + row_counts(:) + END DO + END DO + ! The counted blocks are then summed up + CALL mp_sum(total_row_counts, dbcsr_mp_my_row_group(product_mp_obj)) + ! and used to determine the maximum per-block epsilon. + filter_eps_sp = REAL(filter_eps, KIND=KIND(row_max_epss)) +!$OMP PARALLEL DO DEFAULT (NONE) & +!$OMP SHARED(nblkrows_used,row_max_epss,filter_eps_sp,& +!$OMP total_row_counts) + DO row = 1, nblkrows_used + row_max_epss(row) & + = (filter_eps_sp & + /REAL(MAX(1, total_row_counts(row)), KIND=KIND(row_max_epss)))**2 + END DO +!$OMP END PARALLEL DO + ! + DEALLOCATE (row_counts) + DEALLOCATE (total_row_counts) + END IF per_row_eps + ! + ! The main transfer loop goes through the virtual rows/columns. + ! The number of steps may be smaller if the grid dimension is very + ! non-optimal (both left column images and right row images are > + ! 1). + min_nimages = MIN(left_col_nimages, right_row_nimages) + nvirt_k = left_npcols*left_col_nimages + nsteps_k = nvirt_k/min_nimages + ! + ! Translate the all_sizes to account for pre-distribution. This + ! is just done to simplify lookups. + ALLOCATE (left_sizes(2, 0:left_nprows*left_row_nimages - 1, 0:nvirt_k - 1)) + left_sizes = -1 + DO left_src_vcol = 0, left_col_nimages*left_npcols - 1 + DO left_src_vrow = 0, left_row_nimages*left_nprows - 1 + ! Calculate what was shifted. The left_src_v{row,col} are + ! the "source" rows/columns; the left_dst are the shifted + ! targets where the data was placed in make_images. + CALL image_calculator(left_set%image_dist, & + prow=left_dst_prow, pcol=left_dst_pcol, & + rowi=left_dst_irow, coli=left_dst_icol, & + myvprow=left_src_vrow, myvpcol=left_src_vcol, & + shifting='l') + left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol) + left_sizes(idata, left_src_vrow, left_src_vcol) = & + all_sizes( & + idata + ileft, left_dst_irow, left_dst_icol, left_dst_p) + left_sizes(imeta, left_src_vrow, left_src_vcol) = & + all_sizes( & + imeta + ileft, left_dst_irow, left_dst_icol, left_dst_p) + END DO + END DO + ! + ALLOCATE (right_sizes(2, 0:nvirt_k - 1, 0:right_npcols*right_col_nimages - 1)) + right_sizes = -1 + DO right_src_vcol = 0, right_col_nimages*right_npcols - 1 + DO right_src_vrow = 0, right_row_nimages*right_nprows - 1 + ! Calculate what was shifted. The right_src_v{row,col} are + ! the "source" rows/columns; the right_dst are the shifted + ! targets where the data was placed in make_images. + CALL image_calculator(right_set%image_dist, & + prow=right_dst_prow, pcol=right_dst_pcol, & + rowi=right_dst_irow, coli=right_dst_icol, & + myvprow=right_src_vrow, myvpcol=right_src_vcol, & + shifting='r') + right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol) + right_sizes(idata, right_src_vrow, right_src_vcol) = & + all_sizes( & + idata + iright, right_dst_irow, right_dst_icol, right_dst_p) + right_sizes(imeta, right_src_vrow, right_src_vcol) = & + all_sizes( & + imeta + iright, right_dst_irow, right_dst_icol, right_dst_p) + END DO + END DO + ! + ! Setup product work areas + left_max_nze = MAXVAL(all_sizes(idata + ileft, :, :, :)) + left_max_nblks = MAXVAL(all_sizes(imeta + ileft, :, :, :)) + right_max_nze = MAXVAL(all_sizes(idata + iright, :, :, :)) + right_max_nblks = MAXVAL(all_sizes(imeta + iright, :, :, :)) + !! + ! Evaluate sizes for workspaces + IF (.NOT. keep_sparsity) THEN + IF (has_acc) THEN + size_guess_init = product_matrix_size_guess(left_set%mats(1, 1), right_set%mats(1, 1), product_matrix, & + left_max_nze, right_max_nze, & + left_col_nimages, right_row_nimages, & + nthreads) + ELSE + size_guess_init = 1 + END IF + END IF + ithread = 0 +!$OMP PARALLEL DEFAULT(NONE) & +!$OMP PRIVATE (i, size_guess, ithread) & +!$OMP SHARED (product_matrix, left_max_nze, right_max_nze) & +!$OMP SHARED (left_set, right_set, & +!$OMP left_col_nimages, right_row_nimages) & +!$OMP SHARED (nthreads, keep_sparsity, mynode, size_guess_init) + ! +!$ ithread = OMP_GET_THREAD_NUM() + ! The work arrays have to be setup (actually, not quite sure). + i = ithread + 1 + size_guess = product_matrix%wms(i)%datasize ! Should be minimal + IF (.NOT. keep_sparsity) THEN + size_guess = MAX(size_guess, size_guess_init) + END IF + CALL dbcsr_data_ensure_size(product_matrix%wms(i)%data_area, & + size_guess) + CALL dbcsr_data_set_size_referenced(product_matrix%wms(i)%data_area, & + product_matrix%wms(i)%datasize) + ! XXXXXXX a quick fix right now, allocation with size 1 might actually not be needed at all, + ! but something expects this to be associated + CALL ensure_array_size(product_matrix%wms(i)%row_i, ub=1) + CALL ensure_array_size(product_matrix%wms(i)%col_i, ub=1) + CALL ensure_array_size(product_matrix%wms(i)%blk_p, ub=1) +!$OMP END PARALLEL + + ! update capacity of memory-pools, +1 for the dense case + IF (ASSOCIATED(memtype_abpanel_1%pool)) & + CALL dbcsr_mempool_limit_capacity(memtype_abpanel_1%pool, & + capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1) + IF (ASSOCIATED(memtype_abpanel_2%pool)) & + CALL dbcsr_mempool_limit_capacity(memtype_abpanel_2%pool, & + capacity=left_row_mult*left_col_nimages + right_row_nimages*right_col_mult + 1) + IF (has_acc) THEN + ! enumerate the blocksizes to keep the following 2D-arrays small. + CALL enumerate_blk_sizes(right_set%mats(1, 1)%row_blk_size%low%data, & + dbcsr_max_row_size(right_set%mats(1, 1)), & + row_blk_sizes2enum, enum2row_blk_sizes) + CALL enumerate_blk_sizes(right_set%mats(1, 1)%col_blk_size%low%data, & + dbcsr_max_col_size(right_set%mats(1, 1)), & + col_blk_sizes2enum, enum2col_blk_sizes) + END IF + + ! Save col and row communicators + IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN + right_grp = dbcsr_mp_my_col_group(right_mp_obj) + ELSE + right_grp = dbcsr_mp_group(right_mp_obj) + END IF + IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN + left_grp = dbcsr_mp_my_row_group(left_mp_obj) + ELSE + left_grp = dbcsr_mp_group(left_mp_obj) + END IF + CALL mp_environ(left_numnodes, left_mynode, left_grp) + CALL mp_environ(right_numnodes, right_mynode, right_grp) + + ! + ! Setup the left buffer matrices + ! + CALL buffer_matrices_ensure_size(left_set, index_size=left_max_nblks, & + data_size=left_max_nze) + + CALL setup_buffer_matrices(left_buffer_2, left_row_mult, left_col_nimages, & + left_set%mats(1, 1), index_size=left_max_nblks, & + data_size=left_max_nze) + IF (otf_filtering) THEN + ALLOCATE (left_norms(left_max_nblks), stat=stat) + IF (stat .NE. 0) & + DBCSR_ABORT("Could not allocate memory for left norms") + IF (stat .NE. 0) otf_filtering = .FALSE. + END IF + left_buffer_calc => left_set + left_buffer_comm => left_buffer_2 + ALLOCATE (left_data_sr(left_col_nimages)) + ALLOCATE (left_index_sr(left_col_nimages)) + ALLOCATE (left_data_rr(left_col_nimages)) + ALLOCATE (left_index_rr(left_col_nimages)) + left_data_sr = mp_request_null + left_data_rr = mp_request_null + left_index_sr = mp_request_null + left_index_rr = mp_request_null + + ! Setup buffers for right matrix + CALL buffer_matrices_ensure_size(right_set, index_size=right_max_nblks, & + data_size=right_max_nze) + + CALL setup_buffer_matrices(right_buffer_2, right_row_nimages, right_col_mult, & + right_set%mats(1, 1), index_size=right_max_nblks, data_size=right_max_nze) + IF (otf_filtering) THEN + ALLOCATE (right_norms(right_max_nblks), stat=stat) + IF (stat .NE. 0) & + DBCSR_WARN("Could not allocate memory for right norms") + IF (stat .NE. 0) otf_filtering = .FALSE. + + END IF + IF (has_acc .and. otf_filtering) THEN + max_nblks = MAX(left_max_nblks, right_max_nblks) + CALL dbcsr_data_init(normsbuf) + CALL dbcsr_data_new(normsbuf, data_type=dbcsr_type_real_4, & + data_size=max_nblks, memory_type=memtype_normsbuf) + CALL dbcsr_data_init(offsetsbuf) + CALL dbcsr_data_new(offsetsbuf, data_type=dbcsr_type_int_4, & + data_size=max_nblks, memory_type=memtype_offsetsbuf) + CALL dbcsr_data_init(nelemsbuf) + CALL dbcsr_data_new(nelemsbuf, data_type=dbcsr_type_int_4, & + data_size=max_nblks, memory_type=memtype_nelemsbuf) + END IF + right_buffer_calc => right_set + right_buffer_comm => right_buffer_2 + ALLOCATE (right_data_sr(right_row_nimages)) + ALLOCATE (right_index_sr(right_row_nimages)) + ALLOCATE (right_data_rr(right_row_nimages)) + ALLOCATE (right_index_rr(right_row_nimages)) + right_data_sr = mp_request_null + right_data_rr = mp_request_null + right_index_sr = mp_request_null + right_index_rr = mp_request_null + ! + ALLOCATE (m_sizes(dbcsr_nblkrows_local(product_matrix))) + CALL local_filter(array_data(product_matrix%row_blk_size), array_size(product_matrix%local_rows), & + array_data(product_matrix%local_rows), m_sizes) + ALLOCATE (n_sizes(dbcsr_nblkcols_local(product_matrix))) + CALL local_filter(array_data(product_matrix%col_blk_size), array_size(product_matrix%local_cols), & + array_data(product_matrix%local_cols), n_sizes) + ! +!$OMP PARALLEL & +!$OMP DEFAULT (NONE) & +!$OMP SHARED (left_buffer_comm, right_buffer_comm, product_matrix,& +!$OMP keep_sparsity, filter_eps, row_max_epss, multrec, nthreads, & +!$OMP right_data_sr, right_data_rr, left_data_sr, left_data_rr,& +!$OMP right_index_sr, right_index_rr, left_index_sr, left_index_rr,& +!$OMP m_sizes, n_sizes, keep_product_data), & +!$OMP PRIVATE(ithread) + ithread = 0 +!$ ithread = OMP_GET_THREAD_NUM() + ALLOCATE (multrec(ithread)%p) + CALL dbcsr_mm_multrec_init(multrec(ithread)%p, & + product=product_matrix, & + keep_sparsity=keep_sparsity, & + eps=filter_eps, & + row_max_epss=row_max_epss, & + block_estimate=MAX(product_matrix%nblks, & + left_buffer_comm%mats(1, 1)%nblks, & + right_buffer_comm%mats(1, 1)%nblks)/nthreads, & + right_row_blk_size=array_data(right_buffer_comm%mats(1, 1)%row_blk_size), & + m_sizes=m_sizes, n_sizes=n_sizes, & + keep_product_data=keep_product_data) +!$OMP END PARALLEL + ! + ! Setup indexing + CALL setup_rec_index_2d(left_set, left_row_nimages, left_col_nimages) + CALL setup_rec_index_2d(right_set, right_row_nimages, right_col_nimages) + ! + ! Setup the send/receive data pointers + CALL dbcsr_data_init(left_data_sp) + CALL dbcsr_data_init(left_data_rp) + CALL dbcsr_data_init(right_data_sp) + CALL dbcsr_data_init(right_data_rp) + CALL dbcsr_data_new(left_data_sp, data_type) + CALL dbcsr_data_new(left_data_rp, data_type) + CALL dbcsr_data_new(right_data_sp, data_type) + CALL dbcsr_data_new(right_data_rp, data_type) + + ! Setup transpose stackbuffers + IF (has_acc) THEN + CALL dbcsr_data_init(trs_stackbuf_1) + CALL dbcsr_data_init(trs_stackbuf_2) + CALL dbcsr_data_new(trs_stackbuf_1, data_type=dbcsr_type_int_4, & + data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_1) + CALL dbcsr_data_new(trs_stackbuf_2, data_type=dbcsr_type_int_4, & + data_size=2*right_max_nblks, memory_type=memtype_trsbuffer_2) + trs_stackbuf_calc => trs_stackbuf_1 + trs_stackbuf_comm => trs_stackbuf_2 + END IF + ! + ! Reset indices for virtual images + v_ki_right = 0 + v_ki_left = 0 + ! + ! Here is the main loop. + ! + ! In the first loop iteration, the data is fetched from the + ! sources. In the remaining iterations, the data are exchanged + ! among neighbors. In the last loop only calculations take place. + ! + CALL timeset(routineN//"_loop", handle1) + copy_left = .true. + copy_right = .true. + ! + grouped_k_index: DO metronome = 0, nvirt_k - 1 + ! Wait for right matrix transfer completion. Wait in all but + ! the first loop iteration. + CALL timeset(routineN//"_metrocomm1", handle2) + wait_right: IF (v_ki_right .EQ. right_row_nimages) THEN + ! Reset index + v_ki_right = 0 + IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for right" + ! + CALL mp_waitall(right_data_sr) + CALL mp_waitall(right_data_rr) + CALL mp_waitall(right_index_sr) + CALL mp_waitall(right_index_rr) + ! + ! Repoint indices of right matrices + DO v_ki = 0, right_row_nimages - 1 + CALL dbcsr_repoint_index(right_buffer_calc%mats(v_ki + 1, 1)) + right_buffer_calc%mats(v_ki + 1, 1)%valid = .TRUE. + END DO + END IF wait_right + CALL timestop(handle2) + ! + ! Wait for left matrix transfer completion. Wait in all but + ! the first loop iteration. + CALL timeset(routineN//"_metrocomm3", handle2) + wait_left: IF (v_ki_left .EQ. left_col_nimages) THEN + ! Reset index + v_ki_left = 0 + IF (debug_mod) WRITE (*, '(1X,A)') routineN//" waiting for left" + CALL mp_waitall(left_data_sr) + CALL mp_waitall(left_data_rr) + CALL mp_waitall(left_index_sr) + CALL mp_waitall(left_index_rr) + ! + ! Repoint indices of left matrices + DO v_ki = 0, left_col_nimages - 1 + CALL dbcsr_repoint_index(left_buffer_calc%mats(1, v_ki + 1)) + left_buffer_calc%mats(1, v_ki + 1)%valid = .TRUE. + END DO + END IF wait_left + CALL timestop(handle2) + + v_ki_left = v_ki_left + 1 + v_ki_right = v_ki_right + 1 + + IF (debug_mod) THEN + CALL dbcsr_print(left_buffer_calc%mats(1, v_ki_left), nodata=.TRUE.) + CALL dbcsr_print(right_buffer_calc%mats(v_ki_right, 1), nodata=.TRUE.) + END IF + ! + ! from here the code for dbcsr_mm_driver_inner_init was taken + ! + IF (.FALSE.) WRITE (*, *) routineN//" TICK", metronome + ! Since the right matrix is shifted vertically, the + ! received data always has different notions of "local + ! rows". Thus the local_rows and global_rows must be + ! recalculated. + CALL dbcsr_reset_vlocals(right_buffer_calc%mats(v_ki_right, 1), & + right_set%image_dist) + CALL dbcsr_reset_vlocals(left_buffer_calc%mats(1, v_ki_left), & + left_set%image_dist) + ! + CALL ensure_array_size(k_sizes, ub=array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows)) + CALL local_filter(array_data(right_buffer_calc%mats(v_ki_right, 1)%row_blk_size), & + array_size(right_buffer_calc%mats(v_ki_right, 1)%local_rows), & + array_data(right_buffer_calc%mats(v_ki_right, 1)%local_rows), & + k_sizes) + ! + ! Transfer left and right buffers from host to GPU memory + IF (has_acc) THEN + IF (copy_left) THEN + ! copy left buffer images to device + DO v_ki = 1, left_col_nimages + CALL dbcsr_data_host2dev(left_buffer_calc%mats(1, v_ki)%data_area) + CALL timeset(routineN//"_sync_h2d", handle2) + CALL acc_stream_synchronize(left_buffer_calc%mats(1, v_ki)%data_area%d%memory_type%acc_stream) + CALL timestop(handle2) + END DO + copy_left = .false. + END IF + ! calculate norms for matrices in left buffer + IF (otf_filtering) THEN + left_norms(:) = huge_norm + CALL acc_calculate_norms(left_buffer_calc%mats(1, v_ki_left), & + left_norms, normsbuf, offsetsbuf, nelemsbuf, m_sizes, k_sizes) + END IF + + IF (copy_right) THEN + ! copy right buffer images to device + DO v_ki = 1, right_row_nimages + CALL dbcsr_data_host2dev(right_buffer_calc%mats(v_ki, 1)%data_area) + CALL timeset(routineN//"_sync_h2d", handle2) + CALL acc_stream_synchronize(right_buffer_calc%mats(v_ki, 1)%data_area%d%memory_type%acc_stream) + CALL timestop(handle2) + + ! now transpose right buffer image + CALL acc_transpose_blocks(right_buffer_calc%mats(v_ki, 1), trs_stackbuf_calc, & + k_sizes, n_sizes, & + row_blk_sizes2enum, enum2row_blk_sizes, & + col_blk_sizes2enum, enum2col_blk_sizes) + END DO + ! Wait for transpose to complete before proceeding + CALL timeset(routineN//"_sync_h2d", handle2) + CALL acc_stream_synchronize(trs_stackbuf_calc%d%memory_type%acc_stream) + CALL timestop(handle2) + copy_right = .false. + END IF + ! calculate norms for matrices in right buffer + IF (otf_filtering) THEN + right_norms(:) = huge_norm + CALL acc_calculate_norms(right_buffer_calc%mats(v_ki_right, 1), & + right_norms, normsbuf, offsetsbuf, nelemsbuf, k_sizes, n_sizes) + END IF + END IF + ! + ! Right matrix transfer. Transfer in all but the last loop + ! iteration. + xfer_right: IF (v_ki_right .EQ. 1 .AND. metronome + right_row_nimages .LT. nvirt_k) THEN + DO v_ki = 0, right_row_nimages - 1 + ! Calculate the process to send to. It's the virtual + ! process row -min_nimages up (i.e., smaller row number) + ! from me. + CALL image_calculator(right_set%image_dist, & + prow=right_send_prow, rowi=right_send_irow, & ! output + pcol=right_send_pcol, coli=right_send_icol, & ! output + vprow=right_send_vrow, vpcol=right_send_vcol, & ! output + ! myvprow goes through all of my (process row) images + myvprow=v_ki + right_myfirstvrow, & + myvpcol=right_myfirstvcol, & ! nothing happens in the columns + vprow_shift=-right_row_nimages, & + shifting='0') + ! Calculate which data I send. + CALL image_calculator(right_set%image_dist, & + prow=right_dst_prow, rowi=right_dst_irow, & + pcol=right_dst_pcol, coli=right_dst_icol, & + vprow=right_dst_vrow, vpcol=right_dst_vcol, & + ! myvprows goes through all of my (process row) images + myvprow=v_ki + right_myfirstvrow, & + myvpcol=right_myfirstvcol, & ! nothing happens in the columns + vprow_shift=metronome, & + ! This is with relative shifting. + shifting='R') + right_dst_p = right_pgrid(right_dst_prow, right_dst_pcol) + CALL dbcsr_data_set_pointer( & + area=right_data_sp, & + rsize=right_sizes(idata, right_dst_vrow, right_dst_vcol), & + csize=1, & + pointee=right_buffer_calc%mats(v_ki + 1, 1)%data_area) + right_index_sp => right_buffer_calc%mats( & + v_ki + 1, 1 & + )%index(1: & + right_sizes(imeta, right_dst_vrow, right_dst_vcol)) + ! + ! Calculate the process to receive from + CALL image_calculator(right_set%image_dist, & + prow=right_recv_prow, rowi=right_recv_irow, & + pcol=right_recv_pcol, coli=right_recv_icol, & + vprow=right_recv_vrow, vpcol=right_recv_vcol, & + myvprow=v_ki + right_myfirstvrow, & + myvpcol=right_myfirstvcol, & + vprow_shift=+right_row_nimages, & ! just the opposite as "send to" + shifting='0') + ! Calculate which data I receive + CALL image_calculator(right_set%image_dist, & + prow=right_src_prow, rowi=right_src_irow, & + pcol=right_src_pcol, coli=right_src_icol, & + vprow=right_src_vrow, vpcol=right_src_vcol, & + myvprow=v_ki + right_myfirstvrow, & + myvpcol=right_myfirstvcol, & + ! receive window moves with the metronome + vprow_shift=metronome + right_row_nimages, & + shifting='R') + ! + IF (has_acc) THEN + CALL timeset(routineN//"_acc_sync_right", handle3) + CALL acc_event_synchronize(right_buffer_comm%mats(v_ki + 1, 1)%data_area%d%acc_ready) + CALL timestop(handle3) + END IF + + right_src_p = right_pgrid(right_src_prow, right_src_pcol) + CALL dbcsr_data_set_pointer( & + area=right_data_rp, & + rsize=right_sizes(idata, right_src_vrow, right_src_vcol), & + csize=1, & + pointee=right_buffer_comm%mats(v_ki + 1, 1)%data_area) + right_index_rp => right_buffer_comm%mats( & + v_ki + 1, 1 & + )%index(1: & + right_sizes(imeta, right_src_vrow, right_src_vcol)) + ! + right_send_p = right_pgrid(right_send_prow, right_send_pcol) + right_recv_p = right_pgrid(right_recv_prow, right_recv_pcol) + ! These are column-communicator relative + IF (dbcsr_mp_has_subgroups(right_mp_obj)) THEN + right_send_p = right_send_prow + right_recv_p = right_recv_prow + grp = dbcsr_mp_my_col_group(right_mp_obj) + ELSE + grp = dbcsr_mp_group(right_mp_obj) + END IF + ! + CALL timeset(routineN//"_metrocomm2", handle2) + IF (.not. has_acc) THEN + CALL dbcsr_irecv_any(right_data_rp, right_recv_p, & + grp, right_data_rr(v_ki + 1), tag=right_src_vrow) + ELSE + msglen = right_sizes(idata, right_src_vrow, right_src_vcol) +#if defined (__DBCSR_ACC) + CALL C_F_POINTER(acc_devmem_cptr(right_buffer_comm%mats( & + v_ki + 1, 1)%data_area%d%acc_devmem), & + right_data_rp%d%r_dp, (/msglen/)) +#endif + CALL mp_irecv(right_data_rp%d%r_dp, & + right_recv_p, grp, & + right_data_rr(v_ki + 1), tag=right_src_vrow) + END IF + CALL mp_irecv(right_index_rp, right_recv_p, & + grp, right_index_rr(v_ki + 1), tag=right_src_vrow) + IF (.not. has_acc) THEN + CALL dbcsr_isend_any(right_data_sp, right_send_p, & + grp, right_data_sr(v_ki + 1), tag=right_dst_vrow) + ELSE + msglen = right_sizes(idata, right_dst_vrow, right_dst_vcol) +#if defined (__DBCSR_ACC) + CALL C_F_POINTER(acc_devmem_cptr(right_buffer_calc%mats( & + v_ki + 1, 1)%data_area%d%acc_devmem), & + right_data_sp%d%r_dp, (/msglen/)) +#endif + CALL mp_isend(right_data_sp%d%r_dp, & + right_send_p, grp, & + right_data_sr(v_ki + 1), tag=right_dst_vrow) + END IF + CALL mp_isend(right_index_sp, right_send_p, & + grp, right_index_sr(v_ki + 1), tag=right_dst_vrow) + dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1 + CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(1, :), & + dbcsr_data_get_size(right_data_rp), & + data_type_byte, & + dbcsr_mpi_statistics%data_size_breakdown(:, :, 1)) + CALL timestop(handle2) + END DO + END IF xfer_right + ! + ! Left matrix transfer. Transfer in all but the last processor images. + xfer_left: IF (v_ki_left .EQ. 1 .AND. metronome + left_col_nimages .LT. nvirt_k) THEN + DO v_ki = 0, left_col_nimages - 1 + ! Calculate the process to send to. + CALL image_calculator(left_set%image_dist, & + prow=left_send_prow, rowi=left_send_irow, & ! output + pcol=left_send_pcol, coli=left_send_icol, & ! output + vprow=left_send_vrow, vpcol=left_send_vcol, & ! output + myvprow=left_myfirstvrow, & ! nothing happens in the rows + ! go through all my column images + myvpcol=v_ki + left_myfirstvcol, & + ! send to process left_col_nimages left in the grid + vpcol_shift=-left_col_nimages, & + shifting='0') + ! Calculate which data I send. + CALL image_calculator(left_set%image_dist, & + prow=left_dst_prow, rowi=left_dst_irow, & + pcol=left_dst_pcol, coli=left_dst_icol, & + vprow=left_dst_vrow, vpcol=left_dst_vcol, & + myvprow=left_myfirstvrow, & + ! go through all my column images + myvpcol=v_ki + left_myfirstvcol, & + vpcol_shift=metronome, & + ! This is with relative shifting. + shifting='L') + ! + left_dst_p = left_pgrid(left_dst_prow, left_dst_pcol) + CALL dbcsr_data_set_pointer( & + area=left_data_sp, & + rsize=left_sizes(idata, left_dst_vrow, left_dst_vcol), & + csize=1, & + pointee=left_buffer_calc%mats(1, v_ki + 1)%data_area) + left_index_sp => left_buffer_calc%mats( & + 1, v_ki + 1 & + )%index(1: & + left_sizes(imeta, left_dst_vrow, left_dst_vcol)) + ! + ! Calculate the process to receive from + CALL image_calculator(left_set%image_dist, & + prow=left_recv_prow, rowi=left_recv_irow, & + pcol=left_recv_pcol, coli=left_recv_icol, & + vprow=left_recv_vrow, vpcol=left_recv_vcol, & + myvprow=left_myfirstvrow, & + myvpcol=v_ki + left_myfirstvcol, & + vpcol_shift=+left_col_nimages, & ! just the opposite as "send to" + shifting='0') + ! Calculate which data I receive + CALL image_calculator(left_set%image_dist, & + prow=left_src_prow, rowi=left_src_irow, & + pcol=left_src_pcol, coli=left_src_icol, & + vprow=left_src_vrow, vpcol=left_src_vcol, & + myvprow=left_myfirstvrow, & + myvpcol=v_ki + left_myfirstvcol, & + ! receive window moves with the metronome + vpcol_shift=metronome + left_col_nimages, & + shifting='L') + ! + IF (has_acc) THEN + CALL timeset(routineN//"_acc_sync_left", handle3) + CALL acc_event_synchronize(left_buffer_comm%mats(1, v_ki + 1)%data_area%d%acc_ready) + CALL timestop(handle3) + END IF + + left_src_p = left_pgrid(left_src_prow, left_src_pcol) + CALL dbcsr_data_set_pointer( & + area=left_data_rp, & + rsize=left_sizes(idata, left_src_vrow, left_src_vcol), & + csize=1, & + pointee=left_buffer_comm%mats(1, v_ki + 1)%data_area) + left_index_rp => left_buffer_comm%mats( & + 1, v_ki + 1 & + )%index(1: & + left_sizes(imeta, left_src_vrow, left_src_vcol)) + ! + left_send_p = left_pgrid(left_send_prow, left_send_pcol) + left_recv_p = left_pgrid(left_recv_prow, left_recv_pcol) + ! These are column-communicator relative + IF (dbcsr_mp_has_subgroups(left_mp_obj)) THEN + left_send_p = left_send_pcol + left_recv_p = left_recv_pcol + grp = dbcsr_mp_my_row_group(left_mp_obj) + ELSE + grp = dbcsr_mp_group(left_mp_obj) + END IF + ! + CALL timeset(routineN//"_metrocomm4", handle2) + IF (.not. has_acc) THEN + CALL dbcsr_irecv_any(left_data_rp, left_recv_p, & + grp, left_data_rr(v_ki + 1), tag=left_src_vcol) + ELSE + msglen = left_sizes(idata, left_src_vrow, left_src_vcol) +#if defined (__DBCSR_ACC) + CALL C_F_POINTER(acc_devmem_cptr(left_buffer_comm%mats( & + 1, v_ki + 1)%data_area%d%acc_devmem), & + left_data_rp%d%r_dp, (/msglen/)) +#endif + CALL mp_irecv(left_data_rp%d%r_dp, & + left_recv_p, grp, & + left_data_rr(v_ki + 1), tag=left_src_vcol) + END IF + CALL mp_irecv(left_index_rp, left_recv_p, & + grp, left_index_rr(v_ki + 1), tag=left_src_vcol) + IF (.not. has_acc) THEN + CALL dbcsr_isend_any(left_data_sp, left_send_p, & + grp, left_data_sr(v_ki + 1), tag=left_dst_vcol) + ELSE + msglen = left_sizes(idata, left_dst_vrow, left_dst_vcol) +#if defined (__DBCSR_ACC) + CALL C_F_POINTER(acc_devmem_cptr(left_buffer_calc%mats( & + 1, v_ki + 1)%data_area%d%acc_devmem), & + left_data_sp%d%r_dp, (/msglen/)) +#endif + CALL mp_isend(left_data_sp%d%r_dp, & + left_send_p, grp, & + left_data_sr(v_ki + 1), tag=left_dst_vcol) + END IF + CALL mp_isend(left_index_sp, left_send_p, & + grp, left_index_sr(v_ki + 1), tag=left_dst_vcol) + dbcsr_mpi_statistics%nexchanged = dbcsr_mpi_statistics%nexchanged + 1 + CALL count_mpi_statistics(dbcsr_mpi_statistics%data_size(2, :), & + dbcsr_data_get_size(left_data_rp), & + data_type_byte, & + dbcsr_mpi_statistics%data_size_breakdown(:, :, 2)) + CALL timestop(handle2) + END DO + END IF xfer_left + + ! Do multiplication + + ! If no GPU backend, calculate norms on the CPU + IF (otf_filtering .and. .not. has_acc) THEN + left_norms(:) = huge_norm + right_norms(:) = huge_norm + CALL calculate_norms(right_buffer_calc%mats(v_ki_right, 1), & + right_norms, k_sizes, n_sizes) + CALL calculate_norms(left_buffer_calc%mats(1, v_ki_left), & + left_norms, m_sizes, k_sizes) + END IF + ! + flop_single = 0 + threads_finished = 0 + +!$OMP PARALLEL DEFAULT (NONE) & +!$OMP SHARED (left_buffer_calc, right_buffer_calc, & +!$OMP v_ki_left, v_ki_right, handle2, handle3, & +!$OMP product_matrix, multrec,& +!$OMP filter_eps, right_norms, left_norms, row_max_epss, & +!$OMP keep_sparsity,threads_finished, & +!$OMP right_data_sr, right_data_rr, right_index_sr, right_index_rr, & +!$OMP left_data_sr, left_data_rr, left_index_sr, left_index_rr, & +!$OMP dbcsr_cfg, k_sizes, nvirt_k, metronome) & +!$OMP PRIVATE (ithread,nthreads,threads_finished_read) & +!$OMP REDUCTION (+: flop_single) + ithread = 0; nthreads = 1 +!$ ithread = omp_get_thread_num(); nthreads = omp_get_num_threads() + + CALL timeset(routineN//"_multrec", handle2) + + CALL dbcsr_mm_multrec_multiply(multrec(ithread)%p, & + left=left_buffer_calc%mats(1, v_ki_left), & + right=right_buffer_calc%mats(v_ki_right, 1), & + flop=flop_single, & + a_norms=left_norms, b_norms=right_norms, & + k_sizes=k_sizes) + + IF (metronome == nvirt_k - 1) THEN + CALL timeset(routineN//"_multrec_finalize", handle3) + CALL dbcsr_mm_multrec_finalize(multrec(ithread)%p) + DEALLOCATE (multrec(ithread)%p) + CALL timestop(handle3) + END IF + +!$OMP ATOMIC + threads_finished = threads_finished + 1 + IF (dbcsr_cfg%use_comm_thread%val .AND. (ithread .EQ. 0)) THEN + DO +! requires OMP 3.1 (e.g. gcc >=4.7), for correctness, otherwise we keep fingers crossed +#if defined _OPENMP && _OPENMP >= 200711 +!$OMP ATOMIC READ +#endif + threads_finished_read = threads_finished + IF (threads_finished_read .EQ. nthreads) EXIT + ! Using MPI_Testany to trigger forward progress in MPI + CALL mp_testany(right_data_sr) + CALL mp_testany(right_data_rr) + CALL mp_testany(left_data_sr) + CALL mp_testany(left_data_rr) + CALL mp_testany(right_index_sr) + CALL mp_testany(right_index_rr) + CALL mp_testany(left_index_sr) + CALL mp_testany(left_index_rr) + END DO + END IF +!$OMP BARRIER + CALL timestop(handle2) + +!$OMP END PARALLEL + flop_total = flop_total + flop_single + ! + ! Move to the next images + IF (v_ki_left .EQ. left_col_nimages) THEN + CALL dbcsr_switch(left_buffer_calc, left_buffer_comm) + END IF + IF (v_ki_right .EQ. right_row_nimages) THEN + CALL dbcsr_switch(right_buffer_calc, right_buffer_comm) + CALL dbcsr_switch(trs_stackbuf_calc, trs_stackbuf_comm) + END IF + + END DO grouped_k_index + CALL timestop(handle1) + CALL m_memory(mem) + max_memory = MAX(max_memory, REAL(mem)) + + IF (has_acc) THEN + CALL dbcsr_data_release(trs_stackbuf_1) + CALL dbcsr_data_release(trs_stackbuf_2) + DEALLOCATE (row_blk_sizes2enum, enum2row_blk_sizes) + DEALLOCATE (col_blk_sizes2enum, enum2col_blk_sizes) + IF (otf_filtering) THEN + CALL dbcsr_data_release(normsbuf) + CALL dbcsr_data_release(offsetsbuf) + CALL dbcsr_data_release(nelemsbuf) + END IF + END IF + + IF (ALLOCATED(right_norms)) THEN + DEALLOCATE (right_norms) + END IF + IF (ALLOCATED(left_norms)) THEN + DEALLOCATE (left_norms) + END IF + IF (ALLOCATED(row_max_epss)) THEN + DEALLOCATE (row_max_epss) + END IF + ! + CALL dbcsr_destroy_array(right_buffer_2) + CALL dbcsr_destroy_array(left_buffer_2) + DEALLOCATE (my_sizes) + ! + CALL dbcsr_data_clear_pointer(left_data_sp) + CALL dbcsr_data_clear_pointer(left_data_rp) + CALL dbcsr_data_clear_pointer(right_data_sp) + CALL dbcsr_data_clear_pointer(right_data_rp) + CALL dbcsr_data_release(left_data_sp) + CALL dbcsr_data_release(left_data_rp) + CALL dbcsr_data_release(right_data_sp) + CALL dbcsr_data_release(right_data_rp) + ! + DEALLOCATE (left_data_rr, left_data_sr, left_index_rr, left_index_sr, & + right_data_rr, right_data_sr, right_index_rr, right_index_sr) + ! + ! + IF (debug_mod) THEN + v_ki = 0 + DO i = 1, SIZE(product_matrix%blk_p) + v_ki = MAX(v_ki, ABS(product_matrix%blk_p(i))) + END DO + WRITE (*, *) routineN//" Actual final size", & + LOG(REAL(dbcsr_data_get_size(product_matrix%data_area)))/LOG(10.0), & + LOG(REAL(v_ki))/LOG(10.0) + END IF + ! + flop = flop_total + DEALLOCATE (left_buffer_2, right_buffer_2) + DEALLOCATE (m_sizes, n_sizes) + IF (ASSOCIATED(k_sizes)) DEALLOCATE (k_sizes) + ! + CALL timestop(handle) + END SUBROUTINE multiply_cannon_g2g + SUBROUTINE setup_buffer_matrices(buffer_set, buff_nrows, buff_ncols, & source_matrix, index_size, data_size) TYPE(dbcsr_2d_array_type), INTENT(OUT) :: buffer_set diff --git a/src/mm/dbcsr_mm_common.F b/src/mm/dbcsr_mm_common.F index 937043f23e9..af586472eb7 100644 --- a/src/mm/dbcsr_mm_common.F +++ b/src/mm/dbcsr_mm_common.F @@ -1,5 +1,6 @@ !--------------------------------------------------------------------------------------------------! ! Copyright (C) by the DBCSR developers group - All rights reserved ! +! Copyright (C) 2022 Advanced Micro Devices, Inc. - All rights reserved ! ! This file is part of the DBCSR library. ! ! ! ! For information on the license, see the LICENSE file. ! @@ -12,16 +13,22 @@ MODULE dbcsr_mm_common !! Modification history: !! - 2016-08 Code organization (Alfio Lazzaro). + USE ISO_C_BINDING, ONLY: C_PTR, C_INT + USE dbcsr_acc_event, ONLY: acc_event_record, & acc_event_synchronize, & acc_stream_wait_event - USE dbcsr_acc_stream, ONLY: acc_stream_type + USE dbcsr_acc_stream, ONLY: acc_stream_type, & + acc_stream_synchronize, & + acc_stream_cptr + USE dbcsr_acc_devmem, ONLY: acc_devmem_cptr USE dbcsr_array_types, ONLY: array_data, & array_hold USE dbcsr_acc_operations, ONLY: dbcsr_acc_transpose USE dbcsr_data_methods, ONLY: dbcsr_data_ensure_size, & dbcsr_data_get_size, & dbcsr_data_host2dev, & + dbcsr_data_dev2host, & dbcsr_data_set_size_referenced, & dbcsr_get_data_p_c, & dbcsr_get_data_p_d, & @@ -67,6 +74,8 @@ MODULE dbcsr_mm_common TYPE(dbcsr_memtype_type), SAVE :: memtype_abpanel_1, memtype_abpanel_2, & memtype_trsbuffer_1, memtype_trsbuffer_2, & + memtype_normsbuf, memtype_offsetsbuf, & + memtype_nelemsbuf, & memtype_mpi_buffer, memtype_mpi_product TYPE(acc_stream_type), SAVE :: stream_1, stream_2 ! ab-panels and streams are shared between all threads @@ -83,6 +92,8 @@ MODULE dbcsr_mm_common PUBLIC :: memtype_abpanel_1, memtype_abpanel_2, & memtype_trsbuffer_1, memtype_trsbuffer_2, & + memtype_normsbuf, memtype_offsetsbuf, & + memtype_nelemsbuf, & memtype_mpi_buffer, memtype_mpi_product PUBLIC :: stream_1, stream_2 @@ -93,6 +104,7 @@ MODULE dbcsr_mm_common PUBLIC :: enumerate_blk_sizes PUBLIC :: acc_transpose_blocks + PUBLIC :: acc_calculate_norms PUBLIC :: product_matrix_size_guess @@ -100,6 +112,24 @@ MODULE dbcsr_mm_common PUBLIC :: huge_norm PUBLIC :: local_filter +#if defined (__DBCSR_ACC) + INTERFACE + FUNCTION acc_interface_calculate_norms(mat, nblks, offsets, nelems, norms, stream_ptr) RESULT(istat) & + BIND(C, name="c_calculate_norms") + IMPORT + TYPE(C_PTR), INTENT(IN), VALUE :: mat + TYPE(C_PTR), INTENT(IN), VALUE :: offsets + TYPE(C_PTR), INTENT(IN), VALUE :: nelems + TYPE(C_PTR), VALUE :: norms + INTEGER(KIND=C_INT), INTENT(IN), & + VALUE :: nblks + TYPE(C_PTR), VALUE :: stream_ptr + INTEGER(KIND=C_INT) :: istat + + END FUNCTION acc_interface_calculate_norms + END INTERFACE +#endif + CONTAINS SUBROUTINE count_mpi_statistics(mpi_statistics, data_size, & @@ -463,6 +493,101 @@ SUBROUTINE acc_transpose_blocks(matrix, trs_stackbuf, & CALL timestop(handle) END SUBROUTINE acc_transpose_blocks + SUBROUTINE acc_calculate_norms(matrix, norms, normsbuf, offsetsbuf, nelemsbuf, row_blk_sizes, col_blk_sizes) + !! calculate norms for a set of blocks in matrix whose row and col sizes are given + TYPE(dbcsr_type), INTENT(IN) :: matrix + REAL(kind=sp), DIMENSION(:), INTENT(OUT) :: norms + TYPE(dbcsr_data_obj), INTENT(INOUT), TARGET :: normsbuf + TYPE(dbcsr_data_obj), INTENT(INOUT), TARGET :: offsetsbuf + TYPE(dbcsr_data_obj), INTENT(INOUT), TARGET :: nelemsbuf + INTEGER, DIMENSION(:), POINTER, CONTIGUOUS, INTENT(IN) :: row_blk_sizes, col_blk_sizes + + CHARACTER(len=*), PARAMETER :: routineN = 'acc_calculate_norms' + + INTEGER :: nblks, blk_p, handle, i + INTEGER :: nblks_final, j + INTEGER :: data_type + INTEGER :: row, col + INTEGER, DIMENSION(:), POINTER :: blk_index + REAL, DIMENSION(:), POINTER, CONTIGUOUS :: normsbuf_ptr + INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: offsetsbuf_ptr + INTEGER, DIMENSION(:), POINTER, CONTIGUOUS :: nelemsbuf_ptr +#if defined (__DBCSR_ACC) + INTEGER :: istat +#endif + + CALL timeset(routineN, handle) + + blk_index => matrix%coo_l + nblks = matrix%nblks + data_type = dbcsr_get_data_type(matrix) + + if (nblks > 0 .and. data_type .eq. dbcsr_type_real_8) then + + IF (normsbuf%d%data_type /= dbcsr_type_real_4) & + DBCSR_ABORT("acc_calculate_norms: normsbuf has wrong datatype") + IF (offsetsbuf%d%data_type /= dbcsr_type_int_4) & + DBCSR_ABORT("acc_calculate_norms: offsetsbuf has wrong datatype") + IF (nelemsbuf%d%data_type /= dbcsr_type_int_4) & + DBCSR_ABORT("acc_calculate_norms: nelemsbuf has wrong datatype") + + NULLIFY (normsbuf_ptr) + NULLIFY (offsetsbuf_ptr) + NULLIFY (nelemsbuf_ptr) + CALL dbcsr_data_ensure_size(normsbuf, data_size=nblks, nocopy=.TRUE.) + CALL dbcsr_data_set_size_referenced(normsbuf, nblks) + normsbuf_ptr => normsbuf%d%r_sp + CALL dbcsr_data_ensure_size(offsetsbuf, data_size=nblks, nocopy=.TRUE.) + CALL dbcsr_data_set_size_referenced(offsetsbuf, nblks) + offsetsbuf_ptr => offsetsbuf%d%i4 + CALL dbcsr_data_ensure_size(nelemsbuf, data_size=nblks, nocopy=.TRUE.) + CALL dbcsr_data_set_size_referenced(nelemsbuf, nblks) + nelemsbuf_ptr => nelemsbuf%d%i4 + + j = 1 + DO i = 1, nblks + blk_p = blk_index(3*i) + IF (blk_p == 0) CYCLE + offsetsbuf_ptr(j) = blk_p - 1 + row = blk_index(3*i - 2) + col = blk_index(3*i - 1) + nelemsbuf_ptr(j) = row_blk_sizes(row)*col_blk_sizes(col) + j = j + 1 + END DO + nblks_final = j - 1 + + ! copy offsets to GPU buffer, launch kernel, copy norms back to host + ! offsetsbuf, nelemsbuf and normsbuf share the same stream, so no need + ! to synchronize stream until norms are copied back to host + CALL dbcsr_data_host2dev(offsetsbuf) + CALL dbcsr_data_host2dev(nelemsbuf) +#if defined (__DBCSR_ACC) + istat = acc_interface_calculate_norms(acc_devmem_cptr(matrix%data_area%d%acc_devmem), & + INT(nblks_final, KIND=C_INT), & + acc_devmem_cptr(offsetsbuf%d%acc_devmem), & + acc_devmem_cptr(nelemsbuf%d%acc_devmem), & + acc_devmem_cptr(normsbuf%d%acc_devmem), & + acc_stream_cptr(normsbuf%d%memory_type%acc_stream)) + IF (istat == -1) & + DBCSR_ABORT("acc_calculate_norms: warp size obtained is unexpected") +#endif + CALL dbcsr_data_dev2host(normsbuf) + CALL acc_stream_synchronize(normsbuf%d%memory_type%acc_stream) + + j = 1 + DO i = 1, nblks + blk_p = blk_index(3*i) + IF (blk_p == 0) CYCLE + norms(i) = normsbuf_ptr(j) + j = j + 1 + END DO + else + ! call CPU function to calculate norms + CALL calculate_norms(matrix, norms, row_blk_sizes, col_blk_sizes) + end if + CALL timestop(handle) + END SUBROUTINE acc_calculate_norms + FUNCTION product_matrix_size_guess(matrix_left, matrix_right, product_matrix, & !! Guess the size of the product matrix from the A and B sparsities left_data_size, right_data_size, & diff --git a/tools/docker/Dockerfile.build-env-ubuntu-cuda b/tools/docker/Dockerfile.build-env-ubuntu-cuda index c7f9c5b3eb2..bdcc7bc109d 100644 --- a/tools/docker/Dockerfile.build-env-ubuntu-cuda +++ b/tools/docker/Dockerfile.build-env-ubuntu-cuda @@ -1,5 +1,4 @@ FROM nvidia/cuda:12.2.0-devel-ubuntu22.04 - ENV DEBIAN_FRONTEND=noninteractive RUN set -ex ; \ diff --git a/tools/docker/README.md b/tools/docker/README.md index 60cb8cdb01e..60cc6aacb5a 100644 --- a/tools/docker/README.md +++ b/tools/docker/README.md @@ -4,7 +4,7 @@ All images are hosted on the [GitHub Container Registry of the CP2K organization ## Ubuntu Build Environment -The image is based on Ubuntu 20.04 and contains: +The image is based on Ubuntu 22.04 and contains: * GNU Fortran Compiler * OpenBLAS @@ -19,7 +19,7 @@ The image is based on Ubuntu 20.04 and contains: ```console $ cd dbcsr -$ docker run --rm -it -v $PWD:/app --workdir /app --user $(id -u):$(id -g) ghcr.io/cp2k/dbcsr-build-env-ubuntu-20.04 /bin/bash +$ docker run --rm -it -v $PWD:/app --workdir /app --user $(id -u):$(id -g) ghcr.io/cp2k/dbcsr-build-env-ubuntu-22.04 /bin/bash $ mkdir build && cd build/ $ cmake -G Ninja .. $ cmake --build . @@ -31,12 +31,12 @@ If you need to rebuild the image, use: ```console $ cd dbcsr/tools/docker -$ docker build -t dbcsr-build-env-ubuntu-20.04 -f Dockerfile.build-env-ubuntu . +$ docker build -t dbcsr-build-env-ubuntu-22.04 -f Dockerfile.build-env-ubuntu . ``` ## ROCm Build Environment -The image is based on Ubuntu 20.04 and contains: +The image is based on Ubuntu 22.04 and contains: * GNU Fortran Compiler * OpenBLAS