diff --git a/translator/.gitignore b/translator-v2/.gitignore similarity index 100% rename from translator/.gitignore rename to translator-v2/.gitignore diff --git a/translator/Makefile b/translator-v2/Makefile similarity index 100% rename from translator/Makefile rename to translator-v2/Makefile diff --git a/translator/README.md b/translator-v2/README.md similarity index 100% rename from translator/README.md rename to translator-v2/README.md diff --git a/translator/format.sh b/translator-v2/format.sh similarity index 100% rename from translator/format.sh rename to translator-v2/format.sh diff --git a/translator/fpp/LICENSE.txt b/translator-v2/fpp/LICENSE.txt similarity index 100% rename from translator/fpp/LICENSE.txt rename to translator-v2/fpp/LICENSE.txt diff --git a/translator/fpp/fpp b/translator-v2/fpp/fpp similarity index 100% rename from translator/fpp/fpp rename to translator-v2/fpp/fpp diff --git a/translator/op2-translator/__init__.py b/translator-v2/op2-translator/__init__.py similarity index 100% rename from translator/op2-translator/__init__.py rename to translator-v2/op2-translator/__init__.py diff --git a/translator/op2-translator/__main__.py b/translator-v2/op2-translator/__main__.py similarity index 100% rename from translator/op2-translator/__main__.py rename to translator-v2/op2-translator/__main__.py diff --git a/translator/op2-translator/cpp/__init__.py b/translator-v2/op2-translator/cpp/__init__.py similarity index 100% rename from translator/op2-translator/cpp/__init__.py rename to translator-v2/op2-translator/cpp/__init__.py diff --git a/translator/op2-translator/cpp/parser.py b/translator-v2/op2-translator/cpp/parser.py similarity index 100% rename from translator/op2-translator/cpp/parser.py rename to translator-v2/op2-translator/cpp/parser.py diff --git a/translator/op2-translator/cpp/schemes.py b/translator-v2/op2-translator/cpp/schemes.py similarity index 100% rename from translator/op2-translator/cpp/schemes.py rename to 
translator-v2/op2-translator/cpp/schemes.py diff --git a/translator/op2-translator/cpp/translator/kernels.py b/translator-v2/op2-translator/cpp/translator/kernels.py similarity index 100% rename from translator/op2-translator/cpp/translator/kernels.py rename to translator-v2/op2-translator/cpp/translator/kernels.py diff --git a/translator/op2-translator/cpp/translator/program.py b/translator-v2/op2-translator/cpp/translator/program.py similarity index 100% rename from translator/op2-translator/cpp/translator/program.py rename to translator-v2/op2-translator/cpp/translator/program.py diff --git a/translator/op2-translator/fortran/__init__.py b/translator-v2/op2-translator/fortran/__init__.py similarity index 100% rename from translator/op2-translator/fortran/__init__.py rename to translator-v2/op2-translator/fortran/__init__.py diff --git a/translator/op2-translator/fortran/parser.py b/translator-v2/op2-translator/fortran/parser.py similarity index 100% rename from translator/op2-translator/fortran/parser.py rename to translator-v2/op2-translator/fortran/parser.py diff --git a/translator/op2-translator/fortran/schemes.py b/translator-v2/op2-translator/fortran/schemes.py similarity index 100% rename from translator/op2-translator/fortran/schemes.py rename to translator-v2/op2-translator/fortran/schemes.py diff --git a/translator/op2-translator/fortran/translator/kernels.py b/translator-v2/op2-translator/fortran/translator/kernels.py similarity index 100% rename from translator/op2-translator/fortran/translator/kernels.py rename to translator-v2/op2-translator/fortran/translator/kernels.py diff --git a/translator/op2-translator/fortran/translator/kernels_c.py b/translator-v2/op2-translator/fortran/translator/kernels_c.py similarity index 100% rename from translator/op2-translator/fortran/translator/kernels_c.py rename to translator-v2/op2-translator/fortran/translator/kernels_c.py diff --git a/translator/op2-translator/fortran/translator/program.py 
b/translator-v2/op2-translator/fortran/translator/program.py similarity index 100% rename from translator/op2-translator/fortran/translator/program.py rename to translator-v2/op2-translator/fortran/translator/program.py diff --git a/translator/op2-translator/fortran/util.py b/translator-v2/op2-translator/fortran/util.py similarity index 100% rename from translator/op2-translator/fortran/util.py rename to translator-v2/op2-translator/fortran/util.py diff --git a/translator/op2-translator/fortran/validator.py b/translator-v2/op2-translator/fortran/validator.py similarity index 100% rename from translator/op2-translator/fortran/validator.py rename to translator-v2/op2-translator/fortran/validator.py diff --git a/translator/op2-translator/jinja.py b/translator-v2/op2-translator/jinja.py similarity index 100% rename from translator/op2-translator/jinja.py rename to translator-v2/op2-translator/jinja.py diff --git a/translator/op2-translator/language.py b/translator-v2/op2-translator/language.py similarity index 100% rename from translator/op2-translator/language.py rename to translator-v2/op2-translator/language.py diff --git a/translator/op2-translator/op.py b/translator-v2/op2-translator/op.py similarity index 100% rename from translator/op2-translator/op.py rename to translator-v2/op2-translator/op.py diff --git a/translator/op2-translator/scheme.py b/translator-v2/op2-translator/scheme.py similarity index 100% rename from translator/op2-translator/scheme.py rename to translator-v2/op2-translator/scheme.py diff --git a/translator/op2-translator/store.py b/translator-v2/op2-translator/store.py similarity index 100% rename from translator/op2-translator/store.py rename to translator-v2/op2-translator/store.py diff --git a/translator/op2-translator/target.py b/translator-v2/op2-translator/target.py similarity index 100% rename from translator/op2-translator/target.py rename to translator-v2/op2-translator/target.py diff --git a/translator/op2-translator/util.py 
b/translator-v2/op2-translator/util.py similarity index 100% rename from translator/op2-translator/util.py rename to translator-v2/op2-translator/util.py diff --git a/translator/pyproject.toml b/translator-v2/pyproject.toml similarity index 100% rename from translator/pyproject.toml rename to translator-v2/pyproject.toml diff --git a/translator/python/pypy3.10-v7.3.16-linux64.tar.bz2 b/translator-v2/python/pypy3.10-v7.3.16-linux64.tar.bz2 similarity index 100% rename from translator/python/pypy3.10-v7.3.16-linux64.tar.bz2 rename to translator-v2/python/pypy3.10-v7.3.16-linux64.tar.bz2 diff --git a/translator/python/wheels/MarkupSafe-2.1.5.tar.gz b/translator-v2/python/wheels/MarkupSafe-2.1.5.tar.gz similarity index 100% rename from translator/python/wheels/MarkupSafe-2.1.5.tar.gz rename to translator-v2/python/wheels/MarkupSafe-2.1.5.tar.gz diff --git a/translator/python/wheels/clang-14.0.6-py3-none-any.whl b/translator-v2/python/wheels/clang-14.0.6-py3-none-any.whl similarity index 100% rename from translator/python/wheels/clang-14.0.6-py3-none-any.whl rename to translator-v2/python/wheels/clang-14.0.6-py3-none-any.whl diff --git a/translator/python/wheels/fparser-0.1.4-py3-none-any.whl b/translator-v2/python/wheels/fparser-0.1.4-py3-none-any.whl similarity index 100% rename from translator/python/wheels/fparser-0.1.4-py3-none-any.whl rename to translator-v2/python/wheels/fparser-0.1.4-py3-none-any.whl diff --git a/translator/python/wheels/jinja2-3.1.4-py3-none-any.whl b/translator-v2/python/wheels/jinja2-3.1.4-py3-none-any.whl similarity index 100% rename from translator/python/wheels/jinja2-3.1.4-py3-none-any.whl rename to translator-v2/python/wheels/jinja2-3.1.4-py3-none-any.whl diff --git a/translator/python/wheels/mpmath-1.3.0-py3-none-any.whl b/translator-v2/python/wheels/mpmath-1.3.0-py3-none-any.whl similarity index 100% rename from translator/python/wheels/mpmath-1.3.0-py3-none-any.whl rename to translator-v2/python/wheels/mpmath-1.3.0-py3-none-any.whl 
diff --git a/translator/python/wheels/packaging-24.0-py3-none-any.whl b/translator-v2/python/wheels/packaging-24.0-py3-none-any.whl similarity index 100% rename from translator/python/wheels/packaging-24.0-py3-none-any.whl rename to translator-v2/python/wheels/packaging-24.0-py3-none-any.whl diff --git a/translator/python/wheels/pcpp-1.30-py2.py3-none-any.whl b/translator-v2/python/wheels/pcpp-1.30-py2.py3-none-any.whl similarity index 100% rename from translator/python/wheels/pcpp-1.30-py2.py3-none-any.whl rename to translator-v2/python/wheels/pcpp-1.30-py2.py3-none-any.whl diff --git a/translator/python/wheels/setuptools-70.0.0-py3-none-any.whl b/translator-v2/python/wheels/setuptools-70.0.0-py3-none-any.whl similarity index 100% rename from translator/python/wheels/setuptools-70.0.0-py3-none-any.whl rename to translator-v2/python/wheels/setuptools-70.0.0-py3-none-any.whl diff --git a/translator/python/wheels/setuptools_scm-8.1.0-py3-none-any.whl b/translator-v2/python/wheels/setuptools_scm-8.1.0-py3-none-any.whl similarity index 100% rename from translator/python/wheels/setuptools_scm-8.1.0-py3-none-any.whl rename to translator-v2/python/wheels/setuptools_scm-8.1.0-py3-none-any.whl diff --git a/translator/python/wheels/sympy-1.12.1-py3-none-any.whl b/translator-v2/python/wheels/sympy-1.12.1-py3-none-any.whl similarity index 100% rename from translator/python/wheels/sympy-1.12.1-py3-none-any.whl rename to translator-v2/python/wheels/sympy-1.12.1-py3-none-any.whl diff --git a/translator/python/wheels/tomli-2.0.1-py3-none-any.whl b/translator-v2/python/wheels/tomli-2.0.1-py3-none-any.whl similarity index 100% rename from translator/python/wheels/tomli-2.0.1-py3-none-any.whl rename to translator-v2/python/wheels/tomli-2.0.1-py3-none-any.whl diff --git a/translator/requirements-dev.txt b/translator-v2/requirements-dev.txt similarity index 100% rename from translator/requirements-dev.txt rename to translator-v2/requirements-dev.txt diff --git 
a/translator/requirements.txt b/translator-v2/requirements.txt similarity index 100% rename from translator/requirements.txt rename to translator-v2/requirements.txt diff --git a/translator/resources/templates/cpp/cuda/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja similarity index 100% rename from translator/resources/templates/cpp/cuda/loop_host.hpp.jinja rename to translator-v2/resources/templates/cpp/cuda/loop_host.hpp.jinja diff --git a/translator/resources/templates/cpp/cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja similarity index 100% rename from translator/resources/templates/cpp/cuda/master_kernel.cu.jinja rename to translator-v2/resources/templates/cpp/cuda/master_kernel.cu.jinja diff --git a/translator/resources/templates/cpp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/loop_host.hpp.jinja similarity index 100% rename from translator/resources/templates/cpp/loop_host.hpp.jinja rename to translator-v2/resources/templates/cpp/loop_host.hpp.jinja diff --git a/translator/resources/templates/cpp/master_kernel.cpp.jinja b/translator-v2/resources/templates/cpp/master_kernel.cpp.jinja similarity index 100% rename from translator/resources/templates/cpp/master_kernel.cpp.jinja rename to translator-v2/resources/templates/cpp/master_kernel.cpp.jinja diff --git a/translator/resources/templates/cpp/openmp/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja similarity index 100% rename from translator/resources/templates/cpp/openmp/loop_host.hpp.jinja rename to translator-v2/resources/templates/cpp/openmp/loop_host.hpp.jinja diff --git a/translator/resources/templates/cpp/openmp/master_kernel.cpp.jinja b/translator-v2/resources/templates/cpp/openmp/master_kernel.cpp.jinja similarity index 100% rename from translator/resources/templates/cpp/openmp/master_kernel.cpp.jinja rename to 
translator-v2/resources/templates/cpp/openmp/master_kernel.cpp.jinja diff --git a/translator/resources/templates/cpp/seq/loop_host.hpp.jinja b/translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja similarity index 100% rename from translator/resources/templates/cpp/seq/loop_host.hpp.jinja rename to translator-v2/resources/templates/cpp/seq/loop_host.hpp.jinja diff --git a/translator/resources/templates/cpp/seq/master_kernel.cpp.jinja b/translator-v2/resources/templates/cpp/seq/master_kernel.cpp.jinja similarity index 100% rename from translator/resources/templates/cpp/seq/master_kernel.cpp.jinja rename to translator-v2/resources/templates/cpp/seq/master_kernel.cpp.jinja diff --git a/translator/resources/templates/fortran/c_cuda/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/c_cuda/loop_host.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_cuda/loop_host.F90.jinja rename to translator-v2/resources/templates/fortran/c_cuda/loop_host.F90.jinja diff --git a/translator/resources/templates/fortran/c_cuda/loop_host.cuh.jinja b/translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja similarity index 100% rename from translator/resources/templates/fortran/c_cuda/loop_host.cuh.jinja rename to translator-v2/resources/templates/fortran/c_cuda/loop_host.cuh.jinja diff --git a/translator/resources/templates/fortran/c_cuda/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_cuda/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/c_cuda/master_kernel.F90.jinja diff --git a/translator/resources/templates/fortran/c_cuda/master_kernel.cu.jinja b/translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja similarity index 100% rename from translator/resources/templates/fortran/c_cuda/master_kernel.cu.jinja rename to 
translator-v2/resources/templates/fortran/c_cuda/master_kernel.cu.jinja diff --git a/translator/resources/templates/fortran/c_hip/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/c_hip/loop_host.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_hip/loop_host.F90.jinja rename to translator-v2/resources/templates/fortran/c_hip/loop_host.F90.jinja diff --git a/translator/resources/templates/fortran/c_hip/loop_host.hip.h.jinja b/translator-v2/resources/templates/fortran/c_hip/loop_host.hip.h.jinja similarity index 100% rename from translator/resources/templates/fortran/c_hip/loop_host.hip.h.jinja rename to translator-v2/resources/templates/fortran/c_hip/loop_host.hip.h.jinja diff --git a/translator/resources/templates/fortran/c_hip/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/c_hip/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_hip/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/c_hip/master_kernel.F90.jinja diff --git a/translator/resources/templates/fortran/c_hip/master_kernel.hip.cpp.jinja b/translator-v2/resources/templates/fortran/c_hip/master_kernel.hip.cpp.jinja similarity index 100% rename from translator/resources/templates/fortran/c_hip/master_kernel.hip.cpp.jinja rename to translator-v2/resources/templates/fortran/c_hip/master_kernel.hip.cpp.jinja diff --git a/translator/resources/templates/fortran/c_seq/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/c_seq/loop_host.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_seq/loop_host.F90.jinja rename to translator-v2/resources/templates/fortran/c_seq/loop_host.F90.jinja diff --git a/translator/resources/templates/fortran/c_seq/loop_host.cpp.jinja b/translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja similarity index 100% rename from translator/resources/templates/fortran/c_seq/loop_host.cpp.jinja 
rename to translator-v2/resources/templates/fortran/c_seq/loop_host.cpp.jinja diff --git a/translator/resources/templates/fortran/c_seq/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/c_seq/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/c_seq/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/c_seq/master_kernel.F90.jinja diff --git a/translator/resources/templates/fortran/cuda/consts.F90.jinja b/translator-v2/resources/templates/fortran/cuda/consts.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/cuda/consts.F90.jinja rename to translator-v2/resources/templates/fortran/cuda/consts.F90.jinja diff --git a/translator/resources/templates/fortran/cuda/loop_host.CUF.jinja b/translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja similarity index 100% rename from translator/resources/templates/fortran/cuda/loop_host.CUF.jinja rename to translator-v2/resources/templates/fortran/cuda/loop_host.CUF.jinja diff --git a/translator/resources/templates/fortran/cuda/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/cuda/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/cuda/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/cuda/master_kernel.F90.jinja diff --git a/translator/resources/templates/fortran/fallback_wrapper.F90.jinja b/translator-v2/resources/templates/fortran/fallback_wrapper.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/fallback_wrapper.F90.jinja rename to translator-v2/resources/templates/fortran/fallback_wrapper.F90.jinja diff --git a/translator/resources/templates/fortran/openmp/loop_host.inc.jinja b/translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja similarity index 100% rename from translator/resources/templates/fortran/openmp/loop_host.inc.jinja rename to 
translator-v2/resources/templates/fortran/openmp/loop_host.inc.jinja diff --git a/translator/resources/templates/fortran/openmp/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/openmp/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/openmp/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/openmp/master_kernel.F90.jinja diff --git a/translator/resources/templates/fortran/seq/consts.F90.jinja b/translator-v2/resources/templates/fortran/seq/consts.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/seq/consts.F90.jinja rename to translator-v2/resources/templates/fortran/seq/consts.F90.jinja diff --git a/translator/resources/templates/fortran/seq/loop_host.F90.jinja b/translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/seq/loop_host.F90.jinja rename to translator-v2/resources/templates/fortran/seq/loop_host.F90.jinja diff --git a/translator/resources/templates/fortran/seq/master_kernel.F90.jinja b/translator-v2/resources/templates/fortran/seq/master_kernel.F90.jinja similarity index 100% rename from translator/resources/templates/fortran/seq/master_kernel.F90.jinja rename to translator-v2/resources/templates/fortran/seq/master_kernel.F90.jinja diff --git a/translator/setup.py b/translator-v2/setup.py similarity index 100% rename from translator/setup.py rename to translator-v2/setup.py diff --git a/translator/c/README.md b/translator/c/README.md new file mode 100644 index 000000000..e2ce213fb --- /dev/null +++ b/translator/c/README.md @@ -0,0 +1,26 @@ +### C/C++ Code Generators +This directory contains the OP2 code generators written in python targetting the C/C++ API. The parallelisations and optimisations supported by each generator are as follows: + * `op2_gen_seq.py` + * `op2_gen_openmp.py`: Initial OpenMP code generator. 
+ * `op2_gen_openmp_simple.py`: Simplified and Optimized OpenMP code generator. + * `op2_gen_cuda.py`: Optimized for Fermi GPUs. + * `op2_gen_cuda_simple.py`: Optimized for Kepler GPUs. + * `op2_gen_cuda_simple_hyb.py`: Generates OpenMP code as well as CUDA code into the same file. Both CPUs and GPUs will then be used to do computations as a hybrid application. + +#### Invoking the Code Generator +Uncomment the parallelization you want to generate code for in `op2.py`. For example, for CUDA code generation do: +``` +#op2_gen_seq(str(sys.argv[1]), date, consts, kernels) +#op2_gen_openmp(str(sys.argv[1]), date, consts, kernels) # Initial OpenMP code generator +op2_gen_cuda(str(sys.argv[1]), date, consts, kernels,sets) # Optimized for Fermi GPUs +``` + +Make `op2.py` executable +``` +chmod a+x ./op2.py +``` + +Invoke the code generator by supplying the files that contain op_* API calls. Thus, for example, for Airfoil do the following. +``` +./op2.py airfoil.cpp +``` diff --git a/translator/c/format.sh b/translator/c/format.sh new file mode 100755 index 000000000..29ea2cb39 --- /dev/null +++ b/translator/c/format.sh @@ -0,0 +1,43 @@ +#!/bin/bash +#Uses clang-format to format code to conform to the OP2 coding guidelines +# ... currently only applies to files within the current directory +# also only formats the files produced by the code generator (i.e. files with kernel in their name) + +#for file in ./*.cu ./*.cpp ./*.h ./*.hpp; do clang-format "$file" > "$file"_temp; mv "$file"_temp "$file"; done + +ls ./*kernel* 2> /dev/null +if [ $? -eq 0 ] +then + for file in ./*kernel*; do clang-format -i "$file"; done +fi +ls ./*_op.cpp 2> /dev/null +if [ $? -eq 0 ] +then + for file in ./*_op.cpp; do clang-format -i "$file"; done +fi + +#ls ./*.cu 2> /dev/null +#if [ $? -eq 0 ] +#then +# for file in ./*.cu; do clang-format -i "$file"; done +#fi +#ls ./*.c 2> /dev/null +#if [ $? -eq 0 ] +#then +# for file in ./*.c ; do clang-format -i "$file"; done +#fi +#ls ./*.cpp 2> /dev/null +#if [ $? 
-eq 0 ] +#then +# for file in ./*.cpp ; do clang-format -i "$file"; done +#fi +#ls ./*.h 2> /dev/null +#if [ $? -eq 0 ] +#then +# for file in ./*.h ; do clang-format -i "$file"; done +#fi +#ls ./*.hpp 2> /dev/null +#if [ $? -eq 0 ] +#then +# for file in ./*.hpp ; do clang-format -i "$file"; done +#fi diff --git a/translator/c/op2.py b/translator/c/op2.py new file mode 100755 index 000000000..97a9997d2 --- /dev/null +++ b/translator/c/op2.py @@ -0,0 +1,962 @@ +#!/usr/bin/env python3 + +""" +OP2 source code transformation tool + +This tool parses the user's original source code to produce +target-specific code to execute the user's kernel functions. + +This prototype is written in Python and is directly based on the +parsing and code generation of the MATLAB source code transformation code + +usage: ./op2.py 'file1','file2', ..., [kernel_dir] + +This takes as input + +file1.cpp, file2.cpp, ... + +and produces as output modified versions + +file1_op.cpp, file2_op.cpp, ... + +then calls a number of target-specific code generators +to produce individual kernel files of the form + +xxx_kernel.cpp -- for OpenMP x86 execution +xxx_kernel.cu -- for CUDA execution + +plus a master kernel file of the form + +file1_kernels.cpp -- for OpenMP x86 execution +file1_kernels.cu -- for CUDA execution + +If user kernel files are located in a sub-directory (e.g. 'kernel_dir'), then +this directory can be provided as argument as well. 
# Adapted from http://stackoverflow.com/a/241506/396967
def comment_remover(text):
    """Return ``text`` with C/C++ comments deleted.

    A single regular expression matches, in order of alternation, ``//``
    line comments, ``/* ... */`` block comments, single-quoted literals and
    double-quoted literals.  Quoted literals are matched only so that
    comment markers inside them are left alone; they are copied back
    unchanged.  Comments are replaced by the empty string.
    """
    pattern = re.compile(
        r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
        re.DOTALL | re.MULTILINE
    )

    def replacer(match):
        token = match.group(0)
        # Only comments start with '/'; quoted literals start with a quote.
        return "" if token.startswith('/') else token

    return pattern.sub(replacer, text)


def op_parse_calls(text):
    """Count op_init/op_exit/op_partition/hdf5 mentions in ``text``.

    Comments are stripped first so that commented-out calls are not
    counted.  The counts are plain substring counts.

    Returns:
        A tuple ``(inits, exits, parts, hdf5s)`` of integers.
    """
    # remove comments just for this call
    text = comment_remover(text)

    inits = text.count('op_init')
    exits = text.count('op_exit')
    parts = text.count('op_partition')
    hdf5s = text.count('hdf5')

    return (inits, exits, parts, hdf5s)


def op_decl_set_parse(text):
    """Collect the set names declared via op_decl_set / op_decl_set_hdf5.

    Each call is expected to have exactly two comma-separated arguments;
    the second one is taken as the set name.  For ``op_decl_set`` the name
    is stored as written (surrounding quotes included); for
    ``op_decl_set_hdf5`` the first and last character of the stripped
    argument are dropped.

    Returns:
        A list of ``{'name': ...}`` dicts, or ``None`` (after printing an
        error) as soon as a call with a different argument count is found.
    """
    sets = []

    # The match is greedy and '.' does not cross newlines, so the captured
    # argument list runs up to the last ')' on the same line.
    for m in re.finditer(r'op_decl_set\((.*)\)', text):
        args = m.group(1).split(',')

        # check for syntax errors
        if len(args) != 2:
            print('Error in op_decl_set : must have two arguments')
            return None

        sets.append({'name': args[1].strip()})

    for m in re.finditer(r'op_decl_set_hdf5\((.*)\)', text):
        args = m.group(1).split(',')

        # check for syntax errors
        if len(args) != 2:
            print('Error in op_decl_set_hdf5 : must have two arguments')
            return None

        # [1:-1] drops the first and last character of the name argument.
        sets.append({'name': args[1].strip()[1:-1]})

    return sets
def extract_declared_globals(text):
    """Return the names of variables the user declared ``extern``.

    Matches ``extern <type> <name>`` where type and name are single
    ``\\w+`` tokens, and returns the captured names in order of appearance.
    """
    global_pattern = r'[\s]*extern[\s]+[\w]+[\s]+([\w]+)'
    return list(re.findall(global_pattern, text))


def arg_parse(text, j):
    """Find the ')' that closes the first '(' at or after index ``j``.

    Scans forward from ``j`` tracking parenthesis depth and returns the
    index of the ')' that brings the depth back to zero.

    Raises:
        IndexError: if the end of ``text`` is reached first.  The original
            implementation ran off the end of the string and raised the
            same exception type with a generic message.
    """
    depth = 0
    for pos in range(j, len(text)):
        ch = text[pos]
        if ch == '(':
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                return pos

    raise IndexError(
        'unbalanced parentheses: no matching ")" found from index %d' % j
    )


def get_arg_dat(arg_string, j):
    """Parse one ``op_arg_dat(...)`` call inside an op_par_loop argument list.

    ``j`` is the index in ``arg_string`` at which the ``op_arg_dat`` token
    starts.  The text between its parentheses is stripped of comments and
    split on commas into exactly six fields.

    Returns:
        A dict with keys ``type``, ``dat``, ``idx``, ``map``, ``dim``,
        ``typ``, ``acc`` and ``opt`` (always ``''``), or ``None`` (after
        printing an error) if the field count is not six.
    """
    loc = arg_parse(arg_string, j + 1)
    dat_args_string = arg_string[arg_string.find('(', j) + 1:loc]

    # remove comments
    dat_args_string = comment_remover(dat_args_string)

    # Split once and reuse the pieces instead of re-splitting per field.
    fields = [field.strip() for field in dat_args_string.split(',')]

    # check for syntax errors
    if len(fields) != 6:
        print('Error parsing op_arg_dat(%s): must have six arguments' \
              % dat_args_string)
        return None

    dat, idx, map_name, dim, typ, acc = fields
    return {'type': 'op_arg_dat',
            'dat': dat,
            'idx': idx,
            'map': map_name,
            'dim': dim,
            'typ': typ,
            'acc': acc,
            'opt': ''}
def get_arg_gbl(arg_string, k):
    """Parse one ``op_arg_gbl(...)`` call inside an op_par_loop argument list.

    ``k`` is the index in ``arg_string`` at which the ``op_arg_gbl`` token
    starts.  The text between its parentheses is stripped of comments and
    split on commas into exactly four fields.

    Returns:
        A dict with keys ``type``, ``data``, ``dim``, ``typ``, ``acc`` and
        ``opt`` (always ``''``), or ``None`` (after printing an error) if
        the field count is not four.
    """
    loc = arg_parse(arg_string, k + 1)
    gbl_args_string = arg_string[arg_string.find('(', k) + 1:loc]

    # remove comments
    gbl_args_string = comment_remover(gbl_args_string)

    # Split once and reuse the pieces instead of re-splitting per field.
    fields = [field.strip() for field in gbl_args_string.split(',')]

    # check for syntax errors
    if len(fields) != 4:
        print('Error parsing op_arg_gbl(%s): must have four arguments' \
              % gbl_args_string)
        return None

    data, dim, typ, acc = fields
    return {'type': 'op_arg_gbl',
            'data': data,
            'dim': dim,
            'typ': typ,
            'acc': acc,
            'opt': ''}


def append_init_soa(text):
    """Rewrite op_init / op_mpi_init calls to their ``_soa`` variants.

    ``op_init(<args>)`` becomes ``op_init_soa(<args>,1)`` and
    ``op_mpi_init(<args>)`` becomes ``op_mpi_init_soa(<args>,1)``.  The
    argument capture is greedy and does not cross newlines, so it extends
    to the last ')' on the same line as the call.
    """
    # Raw strings: '\(' and '\)' are invalid escapes in ordinary literals.
    text = re.sub(r'\bop_init\b\s*\((.*)\)', r'op_init_soa(\1,1)', text)
    text = re.sub(r'\bop_mpi_init\b\s*\((.*)\)', r'op_mpi_init_soa(\1,1)', text)
    return text
def op_check_kernel_in_text(text, name):
    """Tell whether ``text`` defines or declares the kernel ``name`` exactly once.

    Two forms are recognised:

    * an inline implementation, ``inline void <name>(``;
    * a declaration, ``void <name>(<params>);`` preceded by at least one
      newline (or a literal ``$``), where the parameter list consists only
      of spaces, newlines, ASCII letters and digits, and the characters
      ``* _ . , #``.

    ``name`` is inserted into the patterns verbatim, so it is interpreted
    as a regular expression fragment.

    Returns:
        ``True`` if the inline form occurs exactly once, or otherwise if
        the declaration form occurs exactly once; ``False`` in every other
        case (including when either form occurs more than once).
    """
    inline_impl_pattern = r'inline[ \n]+void[ \n]+' + name + r'\s*\('
    decl_pattern = (r'([$\n]+)(void[ \n]+' + name + r'\([ \n]*'
                    + r'[ \nA-Za-z0-9\*\_\.,#]+\);)')

    if len(re.findall(inline_impl_pattern, text)) == 1:
        return True
    return len(re.findall(decl_pattern, text)) == 1
= 4 + OP_MAX = 5 + OP_MIN = 6 + + auto_soa=os.getenv('OP_AUTO_SOA','0') + + OP_accs_labels = ['OP_READ', 'OP_WRITE', 'OP_RW', 'OP_INC', + 'OP_MAX', 'OP_MIN'] + + src_files = [s for s in srcFilesAndDirs if os.path.isfile(s)] + src_dirs = [d for d in srcFilesAndDirs if os.path.isdir(d)] + + ## Extract macro definitions: + for src_file in src_files: + print(("Parsing file '" + src_file + "' for macro definitions.")) + with open(src_file, 'r') as f: + text = f.read() + + local_defs = op_parse_macro_defs(text) + for k in list(local_defs.keys()): + if (k in macro_defs) and (local_defs[k] != macro_defs[k]): + msg = "WARNING: Have found two different definitions for macro '{}': '{}' and '{}'. Using the first definition.".format(k, macro_defs[k], local_defs[k]) + print(msg) + continue + else: + macro_defs[k] = local_defs[k] + self_evaluate_macro_defs(macro_defs) + + ## Identify global variables already declared as 'extern': + declared_globals = [] + for src_file in src_files: + with open(src_file, 'r') as f: + text = f.read() + declared_globals += extract_declared_globals(text) + + ## Loop over all input source files to search for op_par_loop calls + kernels_in_files = [[] for _ in range(len(srcFilesAndDirs))] + src_file_num = -1 + for src_file in src_files: + src_file_num = src_file_num + 1 + print(("Processing file " + str(src_file_num+1) + " of " + str(len(src_files)) + \ + ": " + src_file)) + with open(src_file, 'r') as f: + text = f.read() + + any_soa = 0 + + # check for op_init/op_exit/op_partition/op_hdf5 calls + inits, exits, parts, hdf5s = op_parse_calls(text) + + if inits + exits + parts + hdf5s > 0: + print(' ') + if inits > 0: + print('contains op_init call') + if auto_soa!='0': + text = append_init_soa(text) + if exits > 0: + print('contains op_exit call') + if parts > 0: + print('contains op_partition call') + if hdf5s > 0: + print('contains op_hdf5 calls') + + ninit = ninit + inits + nexit = nexit + exits + npart = npart + parts + nhdf5 = nhdf5 + hdf5s + + # 
parse and process constants + + const_args = op_decl_const_parse(text) + set_list = op_decl_set_parse(text) + for i in range(0, len(set_list)): + sets.append(set_list[i]) + + # cleanup '&' symbols from name and convert dim to integer + for i in range(0, len(const_args)): + const_args[i]['dim'] = evaluate_macro_defs_in_string(macro_defs, const_args[i]['dim']) + + if const_args[i]['name'][0] == '&': + const_args[i]['name'] = const_args[i]['name'][1:] + const_args[i]['dim'] = int(const_args[i]['dim']) + + # check for repeats + nconsts = 0 + for i in range(0, len(const_args)): + repeat = 0 + name = const_args[i]['name'] + for c in range(0, nconsts): + if const_args[i]['name'] == consts[c]['name']: + repeat = 1 + if const_args[i]['type'] != consts[c]['type']: + print('type mismatch in repeated op_decl_const') + if const_args[i]['dim'] != consts[c]['dim']: + print('size mismatch in repeated op_decl_const') + + if repeat > 0: + print('repeated global constant ' + const_args[i]['name']) + else: + print('\nglobal constant (' + const_args[i]['name'].strip() \ + + ') of size ' + str(const_args[i]['dim'])) + + # store away in master list + if repeat == 0: + nconsts = nconsts + 1 + temp = {'dim': const_args[i]['dim'], + 'type': const_args[i]['type'].strip(), + 'name': const_args[i]['name'].strip()} + temp["user_declared"] = temp["name"] in declared_globals + consts.append(temp) + + # parse and process op_par_loop calls + + loop_args = op_par_loop_parse(text) + for i in range(0, len(loop_args)): + name = loop_args[i]['name1'] + nargs = loop_args[i]['nargs'] + print('\nprocessing kernel ' + name + ' with ' + str(nargs) + ' arguments', end=' ') + + # process arguments + + var = [''] * nargs + idxs = [0] * nargs + dims = [''] * nargs + maps = [0] * nargs + mapnames = ['']*nargs + typs = [''] * nargs + accs = [0] * nargs + soaflags = [0] * nargs + optflags = [0] * nargs + any_opt = 0 + + for m in range(0, nargs): + argm = loop_args[i]['args'][m] + argm['dim'] = 
evaluate_macro_defs_in_string(macro_defs, argm['dim']) + + arg_type = loop_args[i]['args'][m]['type'] + args = loop_args[i]['args'][m] + + if arg_type.strip() == 'op_arg_dat' or arg_type.strip() == 'op_opt_arg_dat': + argm['idx'] = evaluate_macro_defs_in_string(macro_defs, argm['idx']) + + if arg_type.strip() == 'op_arg_dat' or arg_type.strip() == 'op_opt_arg_dat': + var[m] = args['dat'] + idxs[m] = args['idx'] + if arg_type.strip() == 'op_opt_arg_dat': + any_opt = 1 + + if str(args['map']).strip() == 'OP_ID': + maps[m] = OP_ID + if int(idxs[m]) != -1: + print('invalid index for argument' + str(m)) + else: + maps[m] = OP_MAP + mapnames[m] = str(args['map']).strip() + + dims[m] = args['dim'] + soa_loc = args['typ'].find(':soa') + if ((auto_soa=='1') and (((not dims[m].isdigit()) or int(dims[m])>1)) and (soa_loc < 0)): + soa_loc = len(args['typ'])-1 + + if soa_loc > 0: + soaflags[m] = 1 + any_soa = 1 + typs[m] = args['typ'][1:soa_loc] + else: + typs[m] = args['typ'][1:-1] + + + l = -1 + for l in range(0, len(OP_accs_labels)): + if args['acc'].strip() == OP_accs_labels[l].strip(): + break + + if l == -1: + print('unknown access type for argument ' + str(m)) + else: + accs[m] = l + 1 + + if arg_type.strip() == 'op_opt_arg_dat': + optflags[m] = 1 + else: + optflags[m] = 0 + + if arg_type.strip() == 'op_arg_gbl': + maps[m] = OP_GBL + var[m] = args['data'] + dims[m] = args['dim'] + typs[m] = args['typ'][1:-1] + optflags[m] = 0 + + l = -1 + for l in range(0, len(OP_accs_labels)): + if args['acc'].strip() == OP_accs_labels[l].strip(): + break + + if l == -1: + print('unknown access type for argument ' + str(m)) + else: + accs[m] = l + 1 + + if (maps[m] == OP_GBL) and (accs[m] == OP_WRITE or accs[m] == OP_RW): + print('invalid access type for argument ' + str(m)) + + if (maps[m] != OP_GBL) and (accs[m] == OP_MIN or accs[m] == OP_MAX): + print('invalid access type for argument ' + str(m)) + + + print(' ') + + # identify indirect datasets + + ninds = 0 + inds = [0] * nargs + 
invinds = [0] * nargs + indtyps = [''] * nargs + inddims = [''] * nargs + indaccs = [0] * nargs + invmapinds = [0]*nargs + mapinds = [0]*nargs + + j = [i for i, x in enumerate(maps) if x == OP_MAP] + + while len(j) > 0: + + indtyps[ninds] = typs[j[0]] + inddims[ninds] = dims[j[0]] + indaccs[ninds] = accs[j[0]] + invinds[ninds] = j[0] # inverse mapping + ninds = ninds + 1 + for i in range(0, len(j)): + if var[j[0]] == var[j[i]] and typs[j[0]] == typs[j[i]] \ + and accs[j[0]] == accs[j[i]] and mapnames[j[0]] == mapnames[j[i]]: # same variable + inds[j[i]] = ninds + + k = [] + for i in range(0, len(j)): + if not (var[j[0]] == var[j[i]] and typs[j[0]] == typs[j[i]] + and accs[j[0]] == accs[j[i]] and mapnames[j[0]] == mapnames[j[i]]): # same variable + k = k + [j[i]] + j = k + + if ninds > 0: + invmapinds = invinds[:] + for i in range(0, ninds): + for j in range(0, i): + if (mapnames[invinds[i]] == mapnames[invinds[j]]): + invmapinds[i] = invmapinds[j] + + for i in range(0, nargs): + mapinds[i] = i + for j in range(0, i): + if (maps[i] == OP_MAP) and (mapnames[i] == mapnames[j]) and (idxs[i] == idxs[j]): + mapinds[i] = mapinds[j] + + # check for repeats + + repeat = False + rep1 = False + rep2 = False + which_file = -1 + for nk in range(0, nkernels): + rep1 = kernels[nk]['name'] == name and \ + kernels[nk]['nargs'] == nargs and \ + kernels[nk]['ninds'] == ninds + if rep1: + rep2 = True + for arg in range(0, nargs): + rep2 = rep2 and \ + kernels[nk]['dims'][arg] == dims[arg] and \ + kernels[nk]['maps'][arg] == maps[arg] and \ + kernels[nk]['typs'][arg] == typs[arg] and \ + kernels[nk]['accs'][arg] == accs[arg] and \ + kernels[nk]['idxs'][arg] == idxs[arg] and \ + kernels[nk]['soaflags'][arg] == soaflags[arg] and \ + kernels[nk]['optflags'][arg] == optflags[arg] and \ + kernels[nk]['inds'][arg] == inds[arg] + + for arg in range(0, ninds): + rep2 = rep2 and \ + kernels[nk]['inddims'][arg] == inddims[arg] and \ + kernels[nk]['indaccs'][arg] == indaccs[arg] and \ + 
kernels[nk]['indtyps'][arg] == indtyps[arg] and \ + kernels[nk]['invinds'][arg] == invinds[arg] + if rep2: + print('repeated kernel with compatible arguments: ' + \ + kernels[nk]['name'], end=' ') + repeat = True + which_file = nk + else: + print('repeated kernel with incompatible arguments: ERROR') + break + + # output various diagnostics + + if not repeat: + print(' local constants:', end=' ') + for arg in range(0, nargs): + if maps[arg] == OP_GBL and accs[arg] == OP_READ: + print(str(arg), end=' ') + print('\n global reductions:', end=' ') + for arg in range(0, nargs): + if maps[arg] == OP_GBL and accs[arg] != OP_READ: + print(str(arg), end=' ') + print('\n direct arguments:', end=' ') + for arg in range(0, nargs): + if maps[arg] == OP_ID: + print(str(arg), end=' ') + print('\n indirect arguments:', end=' ') + for arg in range(0, nargs): + if maps[arg] == OP_MAP: + print(str(arg), end=' ') + if ninds > 0: + print('\n number of indirect datasets: ' + str(ninds), end=' ') + if any_opt: + print('\n optional arguments:', end=' ') + for arg in range(0, nargs): + if optflags[arg] == 1: + print(str(arg), end=' ') + + print('\n') + + # store away in master list + + if not repeat: + nkernels = nkernels + 1 + temp = {'name': name, + 'nargs': nargs, + 'dims': dims, + 'maps': maps, + 'var': var, + 'typs': typs, + 'accs': accs, + 'idxs': idxs, + 'inds': inds, + 'soaflags': soaflags, + 'optflags': optflags, + + 'ninds': ninds, + 'inddims': inddims, + 'indaccs': indaccs, + 'indtyps': indtyps, + 'invinds': invinds, + 'mapnames' : mapnames, + 'mapinds': mapinds, + 'invmapinds' : invmapinds} + kernels.append(temp) + (kernels_in_files[src_file_num]).append(nkernels - 1) + else: + append = 1 + for in_file in range(0, len(kernels_in_files[src_file_num])): + if kernels_in_files[src_file_num][in_file] == which_file: + append = 0 + if append == 1: + (kernels_in_files[src_file_num]).append(which_file) + + # output new source file + src_filename = os.path.basename(src_file) + src_dirpath 
= os.path.dirname(src_file) + if src_dirpath[0:2] == "./": + src_dirpath = src_dirpath[2:] + + op_extension = "_op" + if '.' in src_filename: + src_filename_pieces = src_filename.split('.') + n = len(src_filename_pieces) + src_filename_extension = src_filename_pieces[n-1] + op_src_filename = '.'.join(src_filename_pieces[0:(n-1)]) + op_extension + '.' + src_filename_extension + else: + op_src_filename = src_filename + op_extension + op_src_filepath = op_src_filename + op_src_dirpath = "" + if src_dirpath != "": + src_dirpath_pieces = src_dirpath.split('/') + root_dir = src_dirpath_pieces[0] + if len(src_dirpath_pieces) == 0: + rem_dirpath = '' + else: + rem_dirpath = '/'.join(src_dirpath_pieces[1:]) + op_src_dirpath = os.path.join(root_dir+"_op", rem_dirpath) + op_src_filepath = os.path.join(op_src_dirpath, op_src_filename) + + if op_src_dirpath != "" and not os.path.exists(op_src_dirpath): + os.makedirs(op_src_dirpath) + fid = open(op_src_filepath, 'w') + date = datetime.datetime.now() + #fid.write('//\n// auto-generated by op2.py on ' + + # date.strftime("%Y-%m-%d %H:%M") + '\n//\n\n') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + + loc_old = 0 + + # read original file and locate header location + header_len = 11 + loc_header = [text.find("op_seq.h")] + if loc_header[0] == -1: + header_len = 13 + loc_header = [text.find("op_lib_cpp.h")] + + # get locations of all op_decl_consts + n_consts = len(const_args) + loc_consts = [0] * n_consts + for n in range(0, n_consts): + loc_consts[n] = const_args[n]['loc'] + + # get locations of all op_par_loops + n_loops = len(loop_args) + loc_loops = [0] * n_loops + for n in range(0, n_loops): + loc_loops[n] = loop_args[n]['loc'] + + locs = sorted(loc_header + loc_consts + loc_loops) + + # process header, loops and constants + for loc in range(0, len(locs)): + if locs[loc] != -1: + fid.write(text[loc_old:locs[loc] - 1]) + loc_old = locs[loc] - 1 + + indent = '' + ind = 0 + while 1: + if text[locs[loc] - ind] == '\n': + 
break + indent = indent + ' ' + ind = ind + 1 + + if (locs[loc] in loc_header) and (locs[loc] != -1): + fid.write(' "op_lib_cpp.h"\n\n') + fid.write('//\n// op_par_loop declarations\n//\n') + fid.write('#ifdef OPENACC\n#ifdef __cplusplus\nextern "C" {\n#endif\n#endif\n') + for k_iter in range(0, len(kernels_in_files[src_file_num])): + k = kernels_in_files[src_file_num][k_iter] + line = '\nvoid op_par_loop_' + \ + kernels[k]['name'] + '(char const *, op_set,\n' + for n in range(1, kernels[k]['nargs']): + line = line + ' op_arg,\n' + line = line + ' op_arg );\n' + fid.write(line) + for nc in range(0,len(consts)): + fid.write('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,\n') + fid.write(' '+consts[nc]['type'][1:-1]+' *dat);\n') + fid.write('\n') + + fid.write('#ifdef OPENACC\n#ifdef __cplusplus\n}\n#endif\n#endif\n') + fid.write('\n') + loc_old = locs[loc] + header_len-1 + continue + + if locs[loc] in loc_loops: + indent = indent + ' ' * len('op_par_loop') + endofcall = text.find(';', locs[loc]) + curr_loop = loc_loops.index(locs[loc]) + name = loop_args[curr_loop]['name1'] + line = str(' op_par_loop_' + name + '(' + + loop_args[curr_loop]['name2'] + ',' + + loop_args[curr_loop]['set'] + ',\n' + indent) + + for arguments in range(0, loop_args[curr_loop]['nargs']): + elem = loop_args[curr_loop]['args'][arguments] + if elem['type'] == 'op_arg_dat': + line = line + elem['type'] + '(' + elem['dat'] + \ + ',' + elem['idx'] + ',' + elem['map'] + \ + ',' + elem['dim'] + ',' + elem['typ'] + \ + ',' + elem['acc'] + '),\n' + indent + elif elem['type'] == 'op_opt_arg_dat': + line = line + elem['type'] + '(' \ + + elem['opt'] + ',' + elem['dat'] + \ + ',' + elem['idx'] + ',' + elem['map'] + \ + ',' + elem['dim'] + ',' + elem['typ'] + \ + ',' + elem['acc'] + '),\n' + indent + + elif elem['type'] == 'op_arg_gbl': + line = line + elem['type'] + '(' + elem['data'] + \ + ',' + elem['dim'] + ',' + elem['typ'] + \ + ',' + elem['acc'] + '),\n' + indent + + 
fid.write(line[0:-len(indent) - 2] + ');') + + loc_old = endofcall + 1 + continue + + if locs[loc] in loc_consts: + curr_const = loc_consts.index(locs[loc]) + endofcall = text.find(';', locs[loc]) + name = const_args[curr_const]['name'] + fid.write(indent[0:-2] + 'op_decl_const_'+name.strip()+'(' + + str(const_args[curr_const]['dim']) + ',' + + const_args[curr_const]['type'] + ',' + + const_args[curr_const]['name2'].strip() + ');') + loc_old = endofcall + 1 + continue + + fid.write(text[loc_old:]) + fid.close() + # end of loop over input source files + + ## Loop over kernels, looking for a header file named after each + ## kernel in either working directory or one of the input-supplied + ## directories: + for nk in range(0, len(kernels)): + k_data = kernels[nk] + k_name = k_data["name"] + if not "decl_filepath" in list(k_data.keys()): + src_file = k_name + ".h" + if os.path.isfile(src_file): + with open(src_file, 'r') as f: + text = f.read() + if op_check_kernel_in_text(text, k_name): + k_data["decl_filepath"] = src_file + continue + + for dirname in src_dirs: + filepath = os.path.join(dirname, src_file) + if os.path.isfile(filepath): + with open(filepath, 'r') as f: + text = f.read() + if op_check_kernel_in_text(text, k_name): + k_data["decl_filepath"] = filepath + break + + ## Any kernel declarations still not found must exist in files + ## not named after the kernel. 
Search through content of all + ## input-supplied files, and through all files of input-supplied + ## directories: + for nk in range(0, len(kernels)): + if not "decl_filepath" in list(kernels[nk].keys()): + k_data = kernels[nk] + k_name = k_data["name"] + + for src_file in src_files: + with open(src_file, 'r') as f: + text = f.read() + if op_check_kernel_in_text(text, k_name): + k_data["decl_filepath"] = src_file + break + + if not "decl_filepath" in list(k_data.keys()): + for src_dir in src_dirs: + for src_dir_subfile in [s for s in os.listdir(src_dir) if os.path.isfile(os.path.join(src_dir, s))]: + src_dir_subfilepath = os.path.join(src_dir, src_dir_subfile) + with open(src_dir_subfilepath, 'r') as f: + text = f.read() + if op_check_kernel_in_text(text, k_name): + k_data["decl_filepath"] = src_dir_subfilepath + break + if "decl_filepath" in list(k_data.keys()): + break + + fail = False + for nk in range(0, len(kernels)): + if not "decl_filepath" in list(kernels[nk].keys()): + fail = True + print(("Declaration not found for kernel " + kernels[nk]["name"])) + if fail: + exit(2) + + # errors and warnings + + if ninit == 0: + print(' ') + print('-----------------------------') + print(' WARNING: no call to op_init') + if auto_soa==1: + print(' WARNING: code generated with OP_AUTO_SOA,\n but couldn\'t modify op_init to pass\n an additional parameter of 1.\n Please make sure OP_AUTO_SOA is set when executing') + print('-----------------------------') + + if nexit == 0: + print(' ') + print('-------------------------------') + print(' WARNING: no call to op_exit ') + print('-------------------------------') + + if npart == 0 and nhdf5 > 0: + print(' ') + print('---------------------------------------------------') + print(' WARNING: hdf5 calls without call to op_partition ') + print('---------------------------------------------------') + + # + # finally, generate target-specific kernel files + # + masterFile = str(srcFilesAndDirs[0]) + + op2_gen_seq(masterFile, date, 
consts, kernels) # MPI+GENSEQ version - initial version, no vectorisation + # Vec translator is not yet ready for release, eg it cannot translate the 'aero' app. + op2_gen_mpi_vec(masterFile, date, consts, kernels) # MPI+GENSEQ with code that gets auto vectorised with intel compiler (version 15.0 and above) + + #code generators for OpenMP parallelisation with MPI + #op2_gen_openmp(masterFile, date, consts, kernels) # Initial OpenMP code generator + op2_gen_openmp_simple(masterFile, date, consts, kernels) # Simplified and Optimized OpenMP code generator + op2_gen_openacc(masterFile, date, consts, kernels) # Simplified and Optimized OpenMP code generator + + #code generators for NVIDIA GPUs with CUDA + #op2_gen_cuda(masterFile, date, consts, kernels,sets) # Optimized for Fermi GPUs + op2_gen_cuda_simple(masterFile, date, consts, kernels, sets, macro_defs) # Optimized for Kepler GPUs + + # generates openmp code as well as cuda code into the same file + op2_gen_cuda_simple_hyb(masterFile, date, consts, kernels, sets) # CPU and GPU will then do comutations as a hybrid application + + #code generator for GPUs with OpenMP4.5 + op2_gen_openmp4(masterFile, date, consts, kernels) + + # import subprocess + # retcode = subprocess.call("which clang-format > /dev/null", shell=True) + # if retcode == 0: + # retcode = subprocess.call(os.path.dirname(os.path.abspath(__file__))+"/format.sh", shell=True) + # else: + # print 'Cannot find clang-format in PATH' + # print 'Install and add clang-format to PATH to format generated code to conform to code formatting guidelines' + + +if __name__ == '__main__': + # parse the command line arguments (and options) + import getopt + optlist,args = getopt.getopt(sys.argv[1:],'') + # calling the generator + if len(args) > 0: + main(srcFilesAndDirs=args) + # Print usage message if no arguments given + else: + print(__doc__) + sys.exit(1) diff --git a/translator/c/op2_gen_common.py b/translator/c/op2_gen_common.py new file mode 100644 index 
000000000..d318257fb --- /dev/null +++ b/translator/c/op2_gen_common.py @@ -0,0 +1,429 @@ +########################################################################## +# +# Common code generation functions +# +# These functions are called from the target-specific code generators +# +########################################################################## + + +import re +import datetime +import glob +import os + +def comment_remover(text): + """Remove comments from text""" + + def replacer(match): + s = match.group(0) + if s.startswith('/'): + return '' + else: + return s + pattern = re.compile( + r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', + re.DOTALL | re.MULTILINE + ) + return re.sub(pattern, replacer, text) + +def remove_trailing_w_space(text): + text = text+' ' + line_start = 0 + line = "" + line_end = 0 + striped_test = '' + count = 0 + while 1: + line_end = text.find("\n",line_start+1) + line = text[line_start:line_end] + line = line.rstrip() + striped_test = striped_test + line +'\n' + line_start = line_end + 1 + line = "" + if line_end < 0: + return striped_test[:-1] + +def extract_includes(text): + ## Find all '#includes ...' that are not inside of function declarations. 
+ + includes = [] + + include_pattern = r'([ \t]*#include[\s]+[\'\"<][\w\.]+[\'\">])' + + first_line_pattern = '(^)([^\n]+)' + rest_of_lines_pattern = '([\n])([^\n]+)' + line_pattern = re.compile(first_line_pattern + '|' + rest_of_lines_pattern) + + function_depth = 0 + for match in re.findall(line_pattern, text): + if match[1] != "": + line = match[1] + else: + line = match[3] + + ## Remove noise from the line to improve search for + ## entering and exiting of functions: + line_clean = line + # Remove escaped quotation character: + line_clean = re.sub(r"\\\'", '', line_clean) + line_clean = re.sub(r"\\\"", '', line_clean) + # Remove quoted string in single line: + line_clean = re.sub(r'"[^"]*"', '', line_clean) + line_clean = re.sub(r"'[^']*'", '', line_clean) + # Remove quoted string split over two lines: + line_clean = re.sub(r'"[^"]*\\\n[^"]*"', '', line_clean) + line_clean = re.sub(r"'[^']*\\\n[^']*'", '', line_clean) + # Remove inline scoped logic ( {...} ): + line_clean = re.sub(r'{[^{]*}', '', line_clean) + + function_depth += line_clean.count('{') - line_clean.count('}') + if function_depth != 0: + continue + + match = re.search(include_pattern, line) + if match: + includes.append(line) + + return includes + +def para_parse(text, j, op_b, cl_b): + """Parsing code block, i.e. 
text to find the correct closing brace""" + + depth = 0 + loc2 = j + + while 1: + if text[loc2] == op_b: + depth = depth + 1 + + elif text[loc2] == cl_b: + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +def replace_local_includes_with_file_contents(text, search_dir): + ''' Replace occurences of '#include ""' with contents ''' + include_rgx = r'' + "^([\s]*)" + "#include" + "[\s]+" + '"([\w\.]+)"' + + text2 = '' + for line in text.split('\n'): + if not "#include" in line: + text2 += line+'\n' + else: + include_item_filepath = "" + matches = re.findall(include_rgx, line)[0] + if len(matches) != 2: + text2 += line+'\n' + else: + leading_whitespace = matches[0] + include_item = matches[1] + for r, d, f in os.walk(search_dir): + for f_item in f: + if f_item == include_item: + include_item_filepath = os.path.join(r, f_item) + break + if include_item_filepath != "": + break + if include_item_filepath == "": + print(("Failed to locate file '{0}'".format(include_item))) + quit() + f = open(include_item_filepath, 'r') + include_file_text = f.read() + f.close() + include_file_text = comment_remover(include_file_text) + for line in include_file_text.split('\n'): + text2 += leading_whitespace + line+'\n' + return text2 + +def get_stride_string(g_m,maps,mapnames,name): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + if maps[g_m] == OP_ID: + return 'direct_'+name+'_stride_OP2CONSTANT' + if maps[g_m] == OP_GBL: + return '(gridDim%x*blockDim%x)' + else: + idx = mapnames.index(mapnames[g_m]) + return 'opDat'+str(idx)+'_'+name+'_stride_OP2CONSTANT' + +arithmetic_regex_pattern = r'^[ \(\)\+\-\*\\\.\%0-9]+$' + +def op_parse_macro_defs(text): + """Parsing for C macro definitions""" + + defs = {} + macro_def_pattern = r'(\n|^)[ ]*(#define[ ]+)([A-Za-z0-9\_]+)[ ]+([0-9A-Za-z\_\.\+\-\*\/\(\) ]+)' + for match in re.findall(macro_def_pattern, text): + if len(match) < 4: + continue + elif len(match) > 4: + print(("Unexpected format for macro definition: " + str(match))) + 
continue + key = match[2] + value = match[3] + defs[key] = value + # print(key + " -> " + value) + return defs + +def self_evaluate_macro_defs(macro_defs): + """Recursively evaluate C macro definitions that refer to other detected macros""" + + ## First, calculate the expected number of substitutions to perform: + num_subs_expected = 0 + for k in list(macro_defs.keys()): + k_val = macro_defs[k] + m = re.search(arithmetic_regex_pattern, k_val) + if m != None: + continue + + pattern = r'' + '([a-zA-Z0-9_]+)' + occurences = re.findall(pattern, k_val) + for o in occurences: + m = re.search(arithmetic_regex_pattern, o) + if m == None: + if o in list(macro_defs.keys()): + num_subs_expected += 1 + + substitutions_performed = True + num_subs_performed = 0 + while substitutions_performed: + substitutions_performed = False + for k in list(macro_defs.keys()): + k_val = macro_defs[k] + m = re.search(arithmetic_regex_pattern, k_val) + if m != None: + ## This macro definiton is numeric + continue + + if k == k_val: + del macro_defs[k] + continue + + # print("Processing '{0}' -> '{1}'".format(k, k_val)) + + ## If value of key 'k' depends on value of other + ## keys, then substitute in value: + for k2 in list(macro_defs.keys()): + if k == k2: + continue + + pattern = r'' + '(^|[^a-zA-Z0-9_])' + k2 + '($|[^a-zA-Z0-9_])' + m = re.search(pattern, k_val) + + if m != None: + ## The macro "k" refers to macro "k2" + k2_val = macro_defs[k2] + + m = re.search(arithmetic_regex_pattern, k2_val) + if m == None: + # 'k2_val' has not been resolved. Wait for this to occur before + # substituting its value into 'k_val', as this minimises the total + # number of substitutions performed across all macros and so + # improves detection of infinite substitution loops. 
+ continue + + macro_defs[k] = re.sub(pattern, "\\g<1>"+k2_val+"\\g<2>", k_val) + # print("- performing a substitution of '" + k2 + "'->'" + k2_val + "' into '" + k_val + "' to produce '" + macro_defs[k] + "'") + k_val = macro_defs[k] + substitutions_performed = True + + num_subs_performed += 1 + if num_subs_performed > num_subs_expected: + print(("WARNING: " + str(num_subs_performed) + " macro substitutions performed, but expected " + str(num_subs_expected) + ", probably stuck in a loop.")) + return + + ## Evaluate any mathematical expressions: + for k in list(macro_defs.keys()): + val = macro_defs[k] + m = re.search(arithmetic_regex_pattern, val) + if m != None: + res = "" + try: + res = eval(val) + except: + pass + if type(res) != type(""): + if str(res) != val: + # print("Replacing '" + val + "' with '" + str(res) + "'") + macro_defs[k] = str(res) + +def evaluate_macro_defs_in_string(macro_defs, string): + """Recursively evaluate C macro definitions in 'string' """ + + ## First, calculate the expected number of substitutions to perform: + num_subs_expected = 0 + m = re.search(arithmetic_regex_pattern, string) + if m == None: + pattern = r'' + '([a-zA-Z0-9_]+)' + occurences = re.findall(pattern, string) + for o in occurences: + m = re.search(arithmetic_regex_pattern, o) + if m == None: + if o in list(macro_defs.keys()): + num_subs_expected = num_subs_expected + 1 + + resolved_string = string + + substitutions_performed = True + num_subs_performed = 0 + while substitutions_performed: + substitutions_performed = False + for k in list(macro_defs.keys()): + k_val = macro_defs[k] + + k_pattern = r'' + r'' + '(^|[^a-zA-Z0-9_])' + k + '($|[^a-zA-Z0-9_])' + m = re.search(k_pattern, resolved_string) + if m != None: + ## "string" contains a reference to macro "k", so substitute in its definition: + resolved_string_new = re.sub(k_pattern, "\\g<1>"+k_val+"\\g<2>", resolved_string) + # print("Performing a substitution of '" + k + "'->'" + k_val + "' into '" + resolved_string 
+ "'' to produce '" + resolved_string_new + "'") + resolved_string = resolved_string_new + substitutions_performed = True + + num_subs_performed = num_subs_performed + 1 + if num_subs_performed > num_subs_expected: + print(("WARNING: " + str(num_subs_performed) + " macro substitutions performed, but expected " + str(num_subs_expected) + ", probably stuck in a loop.")) + return + + + if re.search(arithmetic_regex_pattern, resolved_string) != None: + res = "" + try: + res = eval(resolved_string) + except: + return resolved_string + else: + if type(res) != type(""): + resolved_string = str(res) + + return resolved_string + +def create_kernel_info(kernel, inc_stage = 0): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + name = kernel['name'] + nargs = kernel['nargs'] + dims = kernel['dims'] + maps = kernel['maps'] + var = kernel['var'] + typs = kernel['typs'] + accs = kernel['accs'] + idxs = kernel['idxs'] + inds = kernel['inds'] + soaflags = kernel['soaflags'] + optflags = kernel['optflags'] + decl_filepath = kernel['decl_filepath'] + + ninds = kernel['ninds'] + inddims = kernel['inddims'] + indaccs = kernel['indaccs'] + indtyps = kernel['indtyps'] + invinds = kernel['invinds'] + mapnames = kernel['mapnames'] + invmapinds = kernel['invmapinds'] + mapinds = kernel['mapinds'] + + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + nargs_novec = nargs + + vec = [m for m in range(0,nargs) if int(idxs[m])<0 and maps[m] == OP_MAP] + if len(vec) > 0: + unique_args = [1]; + vec_counter = 1; + vectorised = [] + new_dims = [] + new_maps = [] + new_vars = [] + new_typs = [] + new_accs = [] + new_idxs = [] + new_inds = [] + new_soaflags = [] + new_optflags = [] + new_mapnames = [] + for m in range(0,nargs): + if int(idxs[m])<0 and maps[m] == OP_MAP: + if m > 0: + unique_args = unique_args + [len(new_dims)+1] + temp = [0]*(-1*int(idxs[m])) + for i in range(0,-1*int(idxs[m])): + temp[i] = var[m] + new_vars = 
new_vars+temp + for i in range(0,-1*int(idxs[m])): + temp[i] = typs[m] + new_typs = new_typs+temp + for i in range(0,-1*int(idxs[m])): + temp[i] = dims[m] + new_dims = new_dims+temp + new_maps = new_maps+[maps[m]]*int(-1*int(idxs[m])) + new_mapnames = new_mapnames+[mapnames[m]]*int(-1*int(idxs[m])) + new_soaflags = new_soaflags+[soaflags[m]]*int(-1*int(idxs[m])) + new_optflags = new_optflags+[optflags[m]]*int(-1*int(idxs[m])) + new_accs = new_accs+[accs[m]]*int(-1*int(idxs[m])) + for i in range(0,-1*int(idxs[m])): + new_idxs = new_idxs+[i] + new_inds = new_inds+[inds[m]]*int(-1*int(idxs[m])) + vectorised = vectorised + [vec_counter]*int(-1*int(idxs[m])) + vec_counter = vec_counter + 1; + else: + if m > 0: + unique_args = unique_args + [len(new_dims)+1] + new_dims = new_dims+[dims[m]] + new_maps = new_maps+[maps[m]] + new_mapnames = new_mapnames+[mapnames[m]] + new_accs = new_accs+[int(accs[m])] + new_soaflags = new_soaflags+[soaflags[m]] + new_optflags = new_optflags+[optflags[m]] + new_idxs = new_idxs+[int(idxs[m])] + new_inds = new_inds+[inds[m]] + new_vars = new_vars+[var[m]] + new_typs = new_typs+[typs[m]] + vectorised = vectorised+[0] + dims = new_dims + maps = new_maps + mapnames = new_mapnames + accs = new_accs + idxs = new_idxs + inds = new_inds + var = new_vars + typs = new_typs + soaflags = new_soaflags; + optflags = new_optflags; + nargs = len(vectorised); + mapinds = [0]*nargs + for i in range(0,nargs): + mapinds[i] = i + for j in range(0,i): + if (maps[i] == OP_MAP) and (mapnames[i] == mapnames[j]) and (idxs[i] == idxs[j]): + mapinds[i] = mapinds[j] + + for i in range(1,ninds+1): + for index in range(0,len(inds)+1): + if inds[index] == i: + invinds[i-1] = index + break + invmapinds = invinds[:] + for i in range(0,ninds): + for j in range(0,i): + if (mapnames[invinds[i]] == mapnames[invinds[j]]): + invmapinds[i] = invmapinds[j] + else: + vectorised = [0]*nargs + unique_args = list(range(1,nargs+1)) + + cumulative_indirect_index = [-1]*nargs; + j = 0; + 
for i in range (0,nargs): + if maps[i] == OP_MAP and ((not inc_stage) or accs[i] == OP_INC): + cumulative_indirect_index[i] = j + j = j + 1 + + return name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index diff --git a/translator/c/op2_gen_cuda.py b/translator/c/op2_gen_cuda.py new file mode 100644 index 000000000..16f2c8af1 --- /dev/null +++ b/translator/c/op2_gen_cuda.py @@ -0,0 +1,1017 @@ +########################################################################## +# +# CUDA code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cu for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import op2_gen_common + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + prefix = ' '*depth + file_text += prefix+rep(text,g_m)+'\n' + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR_INC(i,start,finish,inc): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'+='+inc+' ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_cuda(master, date, consts, kernels, sets): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + 
+########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_RW: + j = i + ind_rw= j >= 0 + if (ind_rw): + print('Error: indirect OP_RW not supported by op2_gen_cuda code generator') + sys.exit(2); + + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ and accs[i] != OP_WRITE: + j = i + reduct = j >= 0 + +########################################################################## +# start with CUDA kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + comm('user function') + + code('__device__') + if FORTRAN: + code('include '+name+'.inc') + elif CPP: + code('#include "../'+name+'.h"') + + comm('') + comm(' CUDA kernel function') + + if FORTRAN: + code('subroutine op_cuda_'+name+'(') + elif CPP: + code('__global__ void op_cuda_'+name+'(') + + depth = 2 + + for g_m 
in range(0,ninds): + if FORTRAN: + code(' *ind_,') + elif CPP: + code(' *ind_,') + + if ninds>0: + if FORTRAN: + code('int *ind_map,') + code('short *arg_map,') + elif CPP: + code('int *ind_map,') + code('short *arg_map,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL and accs[g_m] == OP_READ: + # declared const for performance + if FORTRAN: + code('const *,') + elif CPP: + code('const *,') + elif maps[g_m]==OP_ID and ninds>0: + if FORTRAN: + code(',') + elif CPP: + code(' *,') + elif maps[g_m]==OP_GBL or maps[g_m]==OP_ID: + if FORTRAN: + code(',') + elif CPP: + code(' *,') + + if ninds>0: + if FORTRAN: + code('int *ind_arg_sizes,') + code('int *ind_arg_offs, ') + code('int block_offset, ') + code('int *blkmap, ') + code('int *offset, ') + code('int *nelems, ') + code('int *ncolors, ') + code('int *colors, ') + code('int nblocks, ') + code('int set_size) { ') + if CPP: + code('int *ind_arg_sizes,') + code('int *ind_arg_offs, ') + code('int block_offset, ') + code('int *blkmap, ') + code('int *offset, ') + code('int *nelems, ') + code('int *ncolors, ') + code('int *colors, ') + code('int nblocks, ') + code('int set_size) { ') + else: + code('int offset_s, ') + code('int set_size ) {') + code('') + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code(' _l[];') + if accs[g_m] == OP_INC: + FOR('d','0','') + code('_l[d]=ZERO_;') + ENDFOR() + else: + FOR('d','0','') + code('_l[d]=[d+blockIdx.x*];') + ENDFOR() + elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + code(' _l[];') + elif (ninds==0 and maps[g_m]==OP_ID and dims[g_m]!='1') and not(soaflags[g_m]): + code(' _l[];') + + for m in range (1,ninds+1): + g_m = m -1 + v = [int(inds[i]==m) for i in range(len(inds))] + v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] + if sum(v)>1 and sum(v_i)>0: #check this sum(v_i) + if indaccs[m-1] == OP_INC: + ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 + code(' *_vec['+str(ind)+'] = 
{'); depth += 2; + for n in range(0,nargs): + if inds[n] == m: + g_m = n + code('_l,') + depth -= 2 + code('};') + else: + ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 + if indaccs[m-1] == OP_READ: + code('const *_vec['+str(ind)+'];') + else: + code(' *_vec['+str(ind)+'];') +# +# lengthy code for general case with indirection +# + if ninds>0: + code('') + for g_m in range (0,ninds): + code('__shared__ int *ind__map, ind__size;') + for g_m in range (0,ninds): + code('__shared__ *ind__s;') + + if ind_inc: + code('__shared__ int nelems2, ncolor;') + + code('__shared__ int nelem, offset_b;') + code('') + code('extern __shared__ char shared[];') + code('') + IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') + code('return;') + ENDIF() + IF('threadIdx.x==0') + code('') + comm('get sizes and shift pointers and direct-mapped data') + code('') + code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];') + code('') + code('nelem = nelems[blockId];') + code('offset_b = offset[blockId];') + code('') + + if ind_inc: + code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') + code('ncolor = ncolors[blockId];') + code('') + + for g_m in range (0,ninds): + code('ind__size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds)+'];') + + code('') + + for m in range (1,ninds+1): + g_m = m - 1 + c = [i for i in range(len(inds)) if inds[i]==m] + code('ind__map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ + '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds)+'];') + + code('') + comm('set shared memory pointers') + code('int nbytes = 0;') + + for g_m in range(0,ninds): + code('ind__s = ( *) &shared[nbytes];') + if g_m < ninds-1: + code('nbytes += ROUND_UP(ind__size*sizeof()*);') + + ENDIF() + code('__syncthreads(); // make sure all of above completed') + code('') + comm('copy indirect datasets into shared memory or zero increment') + code('') + + for m in range(0,ninds): + g_m = m + if indaccs[m]==OP_READ or indaccs[m]==OP_RW or 
indaccs[m]==OP_INC: + FOR_INC('n','threadIdx.x','ind__size*','blockDim.x') + if indaccs[m]==OP_READ or indaccs[m]==OP_RW: + code('ind_arg'+str(m)+'_s[n] = ind_arg'+str(m)+'[n%'+inddims[m]+ + '+ind_arg'+str(m)+'_map[n/'+inddims[m]+']*'+inddims[m]+'];') + code('') + elif indaccs[m]==OP_INC: + code('ind__s[n] = ZERO_;') + ENDFOR() + + code('') + code('__syncthreads();') + comm('process set elements') + code('') + + if ind_inc: + FOR_INC('n','threadIdx.x','nelems2','blockDim.x') + code('int col2 = -1;') + IF('n') + code('_l[d] = ZERO_;') + ENDFOR() + else: + FOR_INC('n','threadIdx.x','nelem','blockDim.x') + +# +# simple alternative when no indirection +# + else: + use_shared = 0; + for m in range(0,nargs): + if maps[m]!=OP_GBL and dims[m]!='1': + use_shared = 1 + + if use_shared: + code('int tid = threadIdx.x%OP_WARPSIZE;') + code('') + code('extern __shared__ char shared[];') + code('char *arg_s = shared + offset_s*(threadIdx.x/OP_WARPSIZE);') + + code('') + comm('process set elements') + FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x') + + if use_shared: + code('int offset = n - tid;') + code('int nelems = MIN(OP_WARPSIZE,set_size-offset);') + comm('copy data into shared memory, then into local') + + for m in range(0,nargs): + g_m = m + if (maps[m]!=OP_GBL and accs[m]!=OP_WRITE and dims[m]!='1') and not(soaflags[m]): + FOR('m','0','') + code('(( *)arg_s)[tid+m*nelems] = [tid+m*nelems+offset*];') + ENDFOR() + code('') + FOR('m','0','') + code('_l[m] = (( *)arg_s)[m+tid*];') + ENDFOR() + code('') + + + +# +# kernel call +# + + # xxx: array of pointers for non-locals + for m in range(1,ninds+1): + s = [i for i in range(len(inds)) if inds[i]==m] + if sum(s)>1: + if indaccs[m-1] != OP_INC: + code('') + ctr = 0 + for n in range(0,nargs): + if inds[n] == m and vectorised[n]: + code('arg'+str(m-1)+'_vec['+str(ctr)+'] = ind_arg'+\ + str(inds[n]-1)+'_s+arg_map['+str(cumulative_indirect_index[n])+\ + '*set_size+n+offset_b]*'+str(dims[n])+';') + 
ctr = ctr+1 + + code('') + comm('user-supplied kernel call') + + line = name+'(' + prefix = ' '*len(name) + a = 0 #only apply indentation if its not the 0th argument + indent ='' + for m in range (0, nargs): + if a > 0: + indent = ' '+' '*len(name) + + if maps[m] == OP_GBL: + if accs[m] == OP_READ or accs[m] == OP_WRITE: + line += rep(indent+',\n',m) + else: + line += rep(indent+'_l,\n',m); + a =a+1 + elif maps[m]==OP_MAP and accs[m]==OP_INC and vectorised[m]==0: + line += rep(indent+'_l,\n',m) + a =a+1 + elif maps[m]==OP_MAP and vectorised[m]==0: + line += rep(indent+'ind_arg'+str(inds[m]-1)+'_s+arg_map['+\ + str(cumulative_indirect_index[m])+'*set_size+n+offset_b]*,'+'\n',m) + a =a+1 + elif maps[m]==OP_MAP and m == 0: + line += rep(indent+'_vec,'+'\n',inds[m]-1) + a =a+1 + elif maps[m]==OP_MAP and m>0 and vectorised[m] != vectorised[m-1]: #xxx:vector + line += rep(indent+'_vec,'+'\n',inds[m]-1) + a =a+1 + elif maps[m]==OP_MAP and m>0 and vectorised[m] == vectorised[m-1]: + line = line + a =a+1 + elif maps[m]==OP_ID: + if ninds>0: + if soaflags[m]: + line += rep(indent+'+(n+offset_b),\n',m) + else: + line += rep(indent+'+(n+offset_b)*,\n',m) + a =a+1 + else: + if dims[m] == '1' or soaflags[m]: + line += rep(indent+'+n,\n',m) + else: + line += rep(indent+'_l,\n',m) + a =a+1 + else: + print('internal error 1 ') + + code(line[0:-2]+');') #remove final ',' and \n + +# +# updating for indirect kernels ... 
+# + if ninds>0: + if ind_inc: + code('col2 = colors[n+offset_b];') + ENDIF() + code('') + comm('store local variables') + code('') + + for g_m in range(0,nargs): + if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + code('int _map;') + + IF('col2>=0') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + code('_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];') + + ENDIF() + code('') + FOR('col','0','ncolor') + IF('col2==col') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + FOR('d','0','') + code('ind_arg'+str(inds[g_m]-1)+'_s[d+_map*] += _l[d];') + ENDFOR() + + ENDFOR() + code('__syncthreads();') + ENDFOR() + ENDFOR() + + s = [i for i in range(1,ninds+1) if indaccs[i-1]!= OP_READ] + + if len(s)>0 and max(s)>0: + code('') + comm('apply pointered write/increment') + + for g_m in range(0,ninds): + if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW or indaccs[g_m]==OP_INC: + FOR_INC('n','threadIdx.x','_size*','blockDim.x') + if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW: + code('[n%+_map[n/]*] = _s[n];') + elif indaccs[g_m]==OP_INC: + code('[n%+_map[n/]*] += _s[n];') + ENDFOR() +# +# ... 
and direct kernels +# + else: + if use_shared: + comm('copy back into shared memory, then to device') + for m in range(0,nargs): + g_m = m + if (maps[m]!=OP_GBL and accs[m]!=OP_READ and dims[m]!='1') and not(soaflags[m]): + code('') + FOR('m','0','') + code('(( *)arg_s)[m+tid*] = _l[m];') + ENDFOR() + FOR('m','0','') + code('[tid+m*nelems+offset*] = (( *)arg_s)[tid+m*nelems];') + ENDFOR() + + depth -= 2 + code('}') + +# +# global reduction +# + if reduct: + code('') + comm('global reductions') + code('') + for m in range (0,nargs): + g_m = m + if maps[m]==OP_GBL and accs[m]!=OP_READ and accs[m] != OP_WRITE: + FOR('d','0','') + if accs[m]==OP_INC: + code('op_reduction(&[d+blockIdx.x*],_l[d]);') + elif accs[m]==OP_MIN: + code('op_reduction(&[d+blockIdx.x*],_l[d]);') + elif accs[m]==OP_MAX: + code('op_reduction(&[d+blockIdx.x*],_l[d]);') + else: + print('internal error: invalid reduction option') + sys.exit(2); + ENDFOR() + depth -= 2 + code('}') + code('') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm('host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){') + code('') + else: + code('op_arg ,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + #print vectorised + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 
'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# indirect bits +# + if ninds>0: + code('') + code('int ninds = '+str(ninds)+';') + line = ' int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + + code('') + comm('get plan') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + + IF('set_size > 0') + code('') + code('op_timing_realloc('+str(nk)+');') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('') + if any_soa: + code('int op2_stride_internal = set->size + set->exec_size + set->nonexec_size;') + #code('op_decl_const_char(1, "int", sizeof(int), (char *)&op2_stride, "op2_stride");') + code('cutilSafeCall(cudaMemcpyToSymbol(op2_stride , &op2_stride_internal, sizeof(int)));'); + code('') + +# +# kernel call for indirect version +# + if ninds>0: + code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);') + code('') + + +# +# transfer constants +# + g = [i for i in 
range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)] + if len(g)>0: + comm('transfer constants to GPU') + code('int consts_bytes = 0;') + for m in range(0,nargs): + g_m = m + if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m]==OP_WRITE): + code('consts_bytes += ROUND_UP(*sizeof());') + + code('reallocConstArrays(consts_bytes);') + code('consts_bytes = 0;') + + for m in range(0,nargs): + if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m]==OP_WRITE): + g_m = m + code('.data = OP_consts_h + consts_bytes;') + code('.data_d = OP_consts_d + consts_bytes;') + FOR('d','0','') + code('(( *).data)[d] = h[d];') + ENDFOR() + code('consts_bytes += ROUND_UP(*sizeof());') + code('mvConstArraysToDevice(consts_bytes);') + code('') + + +# +# transfer global reduction initial data +# + + if ninds == 0: + comm('set CUDA execution parameters') + code('#ifdef OP_BLOCK_SIZE_'+str(nk)) + code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') + code('#else') + comm(' int nthread = OP_block_size;') + code(' int nthread = 128;') + code('#endif') + code('') + code('int nblocks = 200;') + code('') + + if reduct: + comm('transfer global reduction data to GPU') + if ninds>0: + code('int maxblocks = 0;') + FOR('col','0','Plan->ncolors') + code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') + ENDFOR() + else: + code('int maxblocks = nblocks;') + + code('int reduct_bytes = 0;') + code('int reduct_size = 0;') + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code('reduct_bytes += ROUND_UP(maxblocks**sizeof());') + code('reduct_size = MAX(reduct_size,sizeof());') + + code('reallocReductArrays(reduct_bytes);') + code('reduct_bytes = 0;') + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code('.data = OP_reduct_h + reduct_bytes;') + code('.data_d = OP_reduct_d + reduct_bytes;') + FOR('b','0','maxblocks') + FOR('d','0','') + if accs[g_m]==OP_INC: + code('(( 
*).data)[d+b*] = ZERO_;') + else: + code('(( *).data)[d+b*] = h[d];') + ENDFOR() + ENDFOR() + code('reduct_bytes += ROUND_UP(maxblocks**sizeof());') + code('mvReductArraysToDevice(reduct_bytes);') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + comm('execute plan') + code('') + code('int block_offset = 0;') + FOR('col','0','Plan->ncolors') + IF('col==Plan->ncolors_core') + code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + code('#ifdef OP_BLOCK_SIZE_'+str(nk)) + code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';') + code('#else') + code('int nthread = OP_block_size;') + code('#endif') + code('') + code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],') + code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);') + IF('Plan->ncolblk[col] > 0') + + if reduct: + code('int nshared = MAX(Plan->nshared,reduct_size*nthread);') + else: + code('int nshared = Plan->nsharedCol[col];') + + code('op_cuda_'+name+'<<>>(') + + for m in range(1,ninds+1): + g_m = invinds[m-1] + code('( *).data_d,') + + code('Plan->ind_map,') + code('Plan->loc_map,') + + for g_m in range(0,nargs): + if inds[g_m]==0: + code('(*).data_d,') + + + code('Plan->ind_sizes,') + code('Plan->ind_offs,') + code('block_offset,') + code('Plan->blkmap,') + code('Plan->offset,') + code('Plan->nelems,') + code('Plan->nthrcol,') + code('Plan->thrcol,') + code('Plan->ncolblk[col],') + code('set_size);') + code('') + if reduct: + comm('transfer global reduction data back to CPU') + IF('col == Plan->ncolors_owned-1') + code('mvReductArraysToHost(reduct_bytes);') + ENDIF() + + ENDFOR() + code('block_offset += Plan->ncolblk[col];') + ENDIF() +# +# kernel call for direct version +# + else: + comm('work out shared memory requirements per element') + code('') + code('int nshared = 0;') + + for g_m in range(0,nargs): + if maps[g_m]!=OP_GBL and dims[g_m]!='1': + code('nshared = MAX(nshared,sizeof()*);') + + code('') + comm('execute plan') + code('int 
offset_s = nshared*OP_WARPSIZE;') + code('') + + if reduct: + code('nshared = MAX(nshared*nthread,reduct_size*nthread);') + else: + code('nshared = nshared*nthread;') + + code('op_cuda_'+name+'<<>>(') + + indent = ' '#*(len(name)+42) + for g_m in range(0,nargs): + if g_m > 0: + code(indent+'( *) .data_d,') + else: + code(indent+'( *) .data_d,') + + code(indent+'offset_s,') + code(indent+'set->size );') + + if ninds>0: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + +# +# transfer global reduction initial data +# + if reduct: + if ninds == 0: + comm('transfer global reduction data back to CPU') + code('mvReductArraysToHost(reduct_bytes);') + + for m in range(0,nargs): + g_m = m + if maps[m]==OP_GBL and accs[m]!=OP_READ and accs[m]!=OP_WRITE: + FOR('b','0','maxblocks') + FOR('d','0','') + if accs[m]==OP_INC: + code('h[d] = h[d] + (( *).data)[d+b*];') + elif accs[m]==OP_MIN: + code('h[d] = MIN(h[d],(( *).data)[d+b*]);') + elif accs[m]==OP_MAX: + code('h[d] = MAX(h[d],(( *).data)[d+b*]);') + ENDFOR() + ENDFOR() + + code('.data = (char *)h;') + code('op_mpi_reduce(&,h);') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('mvConstArraysToHost(consts_bytes);') + break + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + FOR('d','0','') + code('h[d] = (( *).data)[d];') + ENDFOR() + code('.data = (char *)h;') + code('op_mpi_reduce(&,h);') + + ENDIF() + code('op_mpi_set_dirtybit_cuda(nargs, args);') + +# +# update kernel record +# + + comm('update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ or accs[g_m]==OP_WRITE: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + + depth 
= depth - 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('cuda'): + os.makedirs('cuda') + fid = open('cuda/'+name+'_kernel.cu','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text = '' + + comm('global constants') + + code('#ifndef MAX_CONST_SIZE') + code('#define MAX_CONST_SIZE 128') + code('#endif') + code('') + + for nc in range (0,len(consts)): + if consts[nc]['dim']==1: + code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'] > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + + code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm('header') + if os.path.exists('./user_types.h'): + code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif') + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h"') + code('#include "op_cuda_rt_support.h"') + code('#include "op_cuda_reduction.h"') + code('') + + # if any_soa: + # code('__constant__ int op2_stride;') + # code('') + # code('#define OP2_STRIDE(arr, idx) arr[op2_stride*(idx)]') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + depth = depth + 2 + code('if (!OP_hybrid_gpu) return;') + if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1: + IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') + 
code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') + ENDIF() + code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') + depth = depth - 2 + code('}') + + code('') + comm('user kernel files') + + for nk in range(0,len(kernels)): + file_text = file_text +\ + '#include "'+kernels[nk]['name']+'_kernel.cu"\n' + + master = master.split('.')[0] + fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') + fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write(file_text) + fid.close() + + + diff --git a/translator/c/op2_gen_cuda_simple.py b/translator/c/op2_gen_cuda_simple.py new file mode 100644 index 000000000..436f1537e --- /dev/null +++ b/translator/c/op2_gen_cuda_simple.py @@ -0,0 +1,1339 @@ +########################################################################## +# +# CUDA code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cu for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import glob +import os +import op2_gen_common + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR_INC(i,start,finish,inc): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'+='+inc+' ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_cuda_simple(master, date, consts, kernels,sets, macro_defs): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + +########################################################################## +# create 
new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + +#Optimization settings + inc_stage=0 + op_color2_force=1 + atomics=0 + + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk], inc_stage) + + + any_soa = 0 + any_soa = any_soa or sum(soaflags) + op_color2=0 +# +# set logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_RW: + j = i + ind_rw = j >= 0 + + if atomics and ind_rw: + atomics = 0 + + if ind_rw or op_color2_force: + op_color2 = 1 + else: + op_color2 = 0 + + #no staging with 2 level colouring + if op_color2: + inc_stage=0 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ and accs[i] != OP_WRITE: + j = i + reduct = j >= 0 + + if inc_stage: + ninds_staged = 0 + inds_staged = [-1]*nargs + for i in range(0,nargs): + if maps[i]==OP_MAP and accs[i]==OP_INC: + if inds_staged[invinds[inds[i]-1]] == -1: + inds_staged[i] = ninds_staged + ninds_staged = ninds_staged + 1 + else: + inds_staged[i] = inds_staged[invinds[inds[i]-1]] + invinds_staged = [-1]*ninds_staged + inddims_staged = [-1]*ninds_staged + indopts_staged = [-1]*ninds_staged + for i in range(0,nargs): + if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: + invinds_staged[inds_staged[i]] = i + inddims_staged[inds_staged[i]] = dims[i] + if optflags[i] == 1: + indopts_staged[inds_staged[i]] = i + for i in range(0,nargs): + inds_staged[i] = inds_staged[i] + 1 + +########################################################################## +# start with CUDA kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('__constant__ int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') + code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('__constant__ int direct_'+name+'_stride_OP2CONSTANT;') + code('int direct_'+name+'_stride_OP2HOST=-1;') + dir_soa = g_m + break + + file_name = decl_filepath + + f = open(file_name, 'r') + kernel_text = f.read() + f.close() + + if CPP: + includes = 
op2_gen_common.extract_includes(kernel_text) + if len(includes) > 0: + for include in includes: + code(include) + code("") + + comm('user function') + + kernel_text = op2_gen_common.comment_remover(kernel_text) + kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) + + p = re.compile('void\\s+\\b'+name+'\\b') + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + + #i = kernel_text[0:i].rfind('\n') #reverse find + j = kernel_text[i:].find('{') + k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') + signature_text = kernel_text[i:i+j] + l = signature_text[0:].find('(') + head_text = signature_text[0:l].strip() #save function name + m = op2_gen_common.para_parse(signature_text, 0, '(', ')') + signature_text = signature_text[l+1:m] + body_text = kernel_text[i+j+1:k] + + ## Replace occurrences of '#include ""' within loop with the contents of : + body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) + + # check for number of arguments + if len(signature_text.split(',')) != nargs_novec: + print('Error parsing user kernel('+name+'): must have '+str(nargs)+' arguments') + return + + for i in range(0,nargs_novec): + var = signature_text.split(',')[i].strip() + if kernels[nk]['soaflags'][i] and (op_color2 or not (kernels[nk]['maps'][i] == OP_MAP and kernels[nk]['accs'][i] == OP_INC)): + var = var.replace('*','') + #locate var in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var)) + var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + + if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: + body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'\1[(\2)*'+op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + else: + body_text = 
re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) + body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9_]*)\]'+'', var2+r'[(\1)*'+ \ + op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + + for nc in range(0,len(consts)): + varname = consts[nc]['name'] + body_text = re.sub('\\b'+varname+'\\b', varname+'_cuda',body_text) + + signature_text = '__device__ '+head_text + '_gpu( '+signature_text + ') {' + file_text += signature_text + body_text + '}\n' + + comm('') + comm(' CUDA kernel function') + + if FORTRAN: + code('subroutine op_cuda_'+name+'(') + elif CPP: + code('__global__ void op_cuda_'+name+'(') + + depth = 2 + + if nopts > 0: + code('int optflags,') + + for g_m in range(0,ninds): + if (indaccs[g_m]==OP_READ): + code('const *__restrict ,') + else: + code(' *__restrict ,') + + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('const int *__restrict opDat'+str(invinds[inds[g_m]-1])+'Map, ') + + + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if accs[g_m] == OP_READ: + code('const *__restrict ,') + else: + code(' *,') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE: + code(' *,') + elif accs[g_m] == OP_READ: + code('const *,') + + if ind_inc and inc_stage==1: + code('int *ind_map,') + code('short *arg_map,') + code('int *ind_arg_sizes,') + code('int *ind_arg_offs, ') + + if ninds>0: + if op_color2: + code('int start, ') + code('int end, ') + code('int *col_reord, ') + elif not atomics: + code('int block_offset, ') + code('int *blkmap, ') + code('int *offset, ') + code('int *nelems, ') + code('int *ncolors, ') + code('int *colors, ') + code('int nblocks, ') + else: + code('int start, ') + code('int end, ') + code('int set_size) { ') + else: + code('int set_size ) {') + code('') + + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and 
accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE: + code(' _l[];') + if accs[g_m] == OP_INC: + FOR('d','0','') + code('_l[d]=ZERO_;') + ENDFOR() + else: + FOR('d','0','') + code('_l[d]=[d+blockIdx.x*];') + ENDFOR() + elif maps[g_m]==OP_MAP and accs[g_m]==OP_INC and not op_color2 and not atomics: + code(' _l[];') + + if not op_color2 and not atomics: + for m in range (1,ninds+1): + g_m = m -1 + v = [int(inds[i]==m) for i in range(len(inds))] + v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] + if sum(v)>1 and sum(v_i)>0: #check this sum(v_i) + if indaccs[m-1] == OP_INC: + ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 + code(' *arg'+str(invinds[m-1])+'_vec['+str(ind)+'] = {'); depth += 2; + for n in range(0,nargs): + if inds[n] == m: + g_m = n + code('_l,') + depth -= 2 + code('};') +# +# lengthy code for general case with indirection +# + if ninds>0 and not op_color2 and not atomics: + code('') + if inc_stage==1: + for g_m in range (0,ninds): + if indaccs[g_m] == OP_INC: + code('__shared__ int *_map, _size;') + code('__shared__ *_s;') + code('') + if ind_inc: + code('__shared__ int nelems2, ncolor;') + + code('__shared__ int nelem, offset_b;') + code('') + code('extern __shared__ char shared[];') + code('') + IF('blockIdx.x+blockIdx.y*gridDim.x >= nblocks') + code('return;') + ENDIF() + IF('threadIdx.x==0') + code('') + comm('get sizes and shift pointers and direct-mapped data') + code('') + code('int blockId = blkmap[blockIdx.x + blockIdx.y*gridDim.x + block_offset];') + code('') + code('nelem = nelems[blockId];') + code('offset_b = offset[blockId];') + code('') + + if ind_inc: + code('nelems2 = blockDim.x*(1+(nelem-1)/blockDim.x);') + code('ncolor = ncolors[blockId];') + code('') + + if inc_stage==1 and ind_inc: + for g_m in range (0,ninds_staged): + if indopts_staged[g_m-1] > 0: + IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) + code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size = 
ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds_staged)+'];') + if indopts_staged[g_m-1] > 0: + ENDIF() + + code('') + for m in range (1,ninds_staged+1): + g_m = m - 1 + c = [i for i in range(nargs) if inds_staged[i]==m] + code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ + '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds_staged)+'];') + + code('') + comm('set shared memory pointers') + code('int nbytes = 0;') + + for g_m in range(0,ninds_staged): + code('ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_s = ('+typs[invinds_staged[g_m]]+' *) &shared[nbytes];') + if g_m < ninds_staged-1: + if indopts_staged[g_m-1] > 0: + IF('optflags & 1<<'+str(optidxs[indopts_staged[g_m-1]])) + code('nbytes += ROUND_UP(ind_arg'+str(inds[invinds_staged[g_m]]-1)+'_size*sizeof('+typs[invinds_staged[g_m]]+')*'+dims[invinds_staged[g_m]]+');') + if indopts_staged[g_m-1] > 0: + ENDIF() + + + ENDIF() + code('__syncthreads(); // make sure all of above completed') + code('') + + if inc_stage==1: + for g_m in range(0,ninds): + if indaccs[g_m] == OP_INC: + FOR_INC('n','threadIdx.x','_size*','blockDim.x') + code('_s[n] = ZERO_;') + ENDFOR() + if ind_inc: + code('') + code('__syncthreads();') + code('') + + if ind_inc: + FOR_INC('n','threadIdx.x','nelems2','blockDim.x') + code('int col2 = -1;') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + IF('n') + code('_l[d] = ZERO_;') + ENDFOR() + else: + FOR_INC('n','threadIdx.x','nelem','blockDim.x') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + + #non-optional maps + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = 
opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') + + #whatever didn't come up and is opt + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]==1: + IF('optflags & 1<<'+str(optidxs[g_m])) + else: + k = k + [mapinds[g_m]] + + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + offset_b + set_size * '+str(int(idxs[g_m]))+'];') + if optflags[g_m]==1: + ENDIF() + + code('') + for g_m in range (0,nargs): + if accs[g_m] != OP_INC: #TODO: add opt handling here + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + if soaflags[g_m]: + line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' + else: + line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) +# +# simple version for atomics/global coloring +# + elif ninds>0: + code('int tid = threadIdx.x + blockIdx.x * blockDim.x;') + IF('tid + start < end') + if atomics: + code('int n = tid + start;') + else: + code('int n = col_reord[tid + start];') + comm('initialise local variables') + + for g_m in range(0,nargs): + if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + code(' _l[];') + FOR('d','0','') + code('_l[d] = ZERO_;') + ENDFOR() + + #mapidx declarations + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + + #non-optional maps + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not optflags[g_m]) and (not mapinds[g_m] 
in k): + k = k + [mapinds[g_m]] #non-opt + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') + + #whatever didn't come up and is opt + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]==1: + IF('optflags & 1<<'+str(optidxs[g_m])) + else: + k = k + [mapinds[g_m]] + + code('map'+str(mapinds[g_m])+'idx = opDat'+str(invmapinds[inds[g_m]-1])+'Map[n + set_size * '+str(int(idxs[g_m]))+'];') + if optflags[g_m]==1: + ENDIF() + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + if atomics and accs[g_m] == OP_INC: + indent = ' '*(depth+2) + for n in range(0,nargs): + if vectorised[n] == vectorised[g_m]: + line = line + indent + 'arg'+str(n)+'_l,\n' + line = line[:-2]+'};' + code(line) + else: + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + if soaflags[g_m]: + line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[map'+str(mapinds[g_m+k])+'idx],\n' + else: + line = line + indent + ' &ind_arg'+str(inds[first]-1)+'[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) + + + + +# +# simple alternative when no indirection +# + else: + code('') + comm('process set elements') + if reduct: + FOR_INC('n','threadIdx.x+blockIdx.x*blockDim.x','set_size','blockDim.x*gridDim.x') + else: + code('int n = threadIdx.x+blockIdx.x*blockDim.x;') + IF('n < set_size') + +# +# kernel call +# + code('') + comm('user-supplied kernel call') + line = name+'_gpu(' + prefix = ' '*len(name) + a = 0 #only apply indentation if its not the 0th argument + indent ='' + for m in range (0, nargs): + if a > 0: + indent = ' '+' 
'*len(name) + + if maps[m] == OP_GBL: + if accs[m] == OP_READ or accs[m] == OP_WRITE: + line += rep(indent+',\n',m) + else: + line += rep(indent+'_l,\n',m); + a =a+1 + elif maps[m]==OP_MAP and accs[m]==OP_INC and not op_color2: + if vectorised[m]: + if m+1 in unique_args: + line += rep(indent+'_vec,\n',m) + else: + line += rep(indent+'_l,\n',m) + a =a+1 + elif maps[m]==OP_MAP: + if vectorised[m]: + if m+1 in unique_args: + line += rep(indent+'_vec,\n',m) + else: + if soaflags[m]: + line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx,'+'\n',m) + else: + line += rep(indent+'ind_arg'+str(inds[m]-1)+'+map'+str(mapinds[m])+'idx*,'+'\n',m) + a =a+1 + elif maps[m]==OP_ID: + if ninds>0 and not op_color2 and not atomics: + if soaflags[m]: + line += rep(indent+'+(n+offset_b),\n',m) + else: + line += rep(indent+'+(n+offset_b)*,\n',m) + a =a+1 + else: + if soaflags[m]: + line += rep(indent+'+n,\n',m) + else: + line += rep(indent+'+n*,\n',m) + a =a+1 + else: + print('internal error 1 ') + + code(line[0:-2]+');') #remove final ',' and \n + +# +# updating for indirect kernels ... 
+# + if ninds>0 and not op_color2 and not atomics: + if ind_inc: + code('col2 = colors[n+offset_b];') + ENDIF() + code('') + comm('store local variables') + code('') + if inc_stage==1: + for g_m in range(0,nargs): + if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + code('int _map;') + IF('col2>=0') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + code('_map = arg_map['+str(cumulative_indirect_index[g_m])+'*set_size+n+offset_b];') + ENDIF() + code('') + + FOR('col','0','ncolor') + IF('col2==col') + + if inc_stage==1: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if optflags[g_m]==1: + IF('optflags & 1<<'+str(optidxs[g_m])) + for d in range(0,int(dims[g_m])): + if soaflags[g_m]: + code('_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s[_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size];') + else: + code('_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+_map*];') +# for g_m in range(0,nargs): +# if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + for d in range(0,int(dims[g_m])): + if soaflags[g_m]: + code('ind_arg'+str(inds[g_m]-1)+'_s[_map+'+str(d)+'*ind_arg'+str(inds[g_m]-1)+'_size] = _l['+str(d)+'];') + else: + code('ind_arg'+str(inds[g_m]-1)+'_s['+str(d)+'+_map*] = _l['+str(d)+'];') + + if optflags[g_m]==1: + ENDIF() + else: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if optflags[g_m]==1: + IF('optflags & 1<<'+str(optidxs[g_m])) + for d in range(0,int(dims[g_m])): + if soaflags[g_m]: + code('_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx];') + else: + code('_l['+str(d)+'] += ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*];') +# for g_m in range(0,nargs): +# if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + for d in range(0,int(dims[g_m])): + if soaflags[g_m]: + 
code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx] = _l['+str(d)+'];') + else: + code('ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*] = _l['+str(d)+'];') + if optflags[g_m]==1: + ENDIF() + + ENDFOR() + code('__syncthreads();') + ENDFOR() + if ninds>0 and atomics: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if optflags[g_m]==1: + IF('optflags & 1<<'+str(optidxs[g_m])) + for d in range(0,int(dims[g_m])): + if soaflags[g_m]: + code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'*'+op2_gen_common.get_stride_string(g_m,maps,mapnames,name)+'+map'+str(mapinds[g_m])+'idx],_l['+str(d)+']);') + else: + code('atomicAdd(&ind_arg'+str(inds[g_m]-1)+'['+str(d)+'+map'+str(mapinds[g_m])+'idx*],_l['+str(d)+']);') + if optflags[g_m]==1: + ENDIF() + + + ENDFOR() + + if inc_stage: + for g_m in range(0,ninds): + if indaccs[g_m]==OP_INC: + if indopts[g_m] > 0: + IF('optflags & 1<<'+str(optidxs[indopts[g_m-1]])) + if soaflags[invinds[g_m]]: + FOR_INC('n','threadIdx.x','_size','blockDim.x') + for d in range(0,int(dims[invinds[g_m]])): + code('arg'+str(invinds[g_m])+'_l['+str(d)+'] = _s[n+'+str(d)+'*_size] + [_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(invinds[g_m],maps,mapnames,name)+'];') + for d in range(0,int(dims[invinds[g_m]])): + code('[_map[n]+'+str(d)+'*'+op2_gen_common.get_stride_string(invinds[g_m],maps,mapnames,name)+'] = arg'+str(invinds[g_m])+'_l['+str(d)+'];') + ENDFOR() + else: + FOR_INC('n','threadIdx.x','_size*','blockDim.x') + code('[n%+_map[n/]*] += _s[n];') + ENDFOR() + if indopts[g_m] > 0: + ENDIF() + +# +# global reduction +# + if reduct: + code('') + comm('global reductions') + code('') + for m in range (0,nargs): + g_m = m + if maps[m]==OP_GBL and accs[m]!=OP_READ and accs[m] != OP_WRITE: + FOR('d','0','') + if accs[m]==OP_INC: + code('op_reduction(&[d+blockIdx.x*],_l[d]);') + elif accs[m]==OP_MIN: + 
code('op_reduction(&[d+blockIdx.x*],_l[d]);') + elif accs[m]==OP_MAX: + code('op_reduction(&[d+blockIdx.x*],_l[d]);') + else: + print('internal error: invalid reduction option') + sys.exit(2); + ENDFOR() + depth -= 2 + code('}') + code('') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm('host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){') + code('') + else: + code('op_arg ,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + + if nopts>0: + code('int optflags = 0;') + for i in range(0,nargs): + if optflags[i] == 1: + IF('args['+str(i)+'].opt') + code('optflags |= 1<<'+str(optidxs[i])+';') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + 
code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('') + +# +# indirect bits +# + if ninds>0: + code('') + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + + if not atomics: + code('') + comm('get plan') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + #code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + code('int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + code('int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 2);') + #code('op_mpi_halo_exchanges_cuda(set, nargs, args);') + + IF('set_size > 0') + code('') + +# +# kernel call for indirect version +# + if ninds>0 and not atomics: + if inc_stage==1 and ind_inc: + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_INC);') + elif op_color2: + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') + else: + code('op_plan *Plan = op_plan_get(name,set,part_size,nargs,args,ninds,inds);') + code('') + + +# +# transfer constants +# + g = [i for i in range(0,nargs) if maps[i] == OP_GBL and (accs[i] == OP_READ or accs[i] == OP_WRITE)] + if len(g)>0: + comm('transfer constants to GPU') + code('int consts_bytes = 0;') + for m in range(0,nargs): + g_m = m + if maps[m]==OP_GBL and (accs[m]==OP_READ or accs[m] == OP_WRITE): + code('consts_bytes += ROUND_UP(*sizeof());') + + 
code('reallocConstArrays(consts_bytes);') + code('consts_bytes = 0;') + + for m in range(0,nargs): + if maps[m]==OP_GBL and (accs[m] == OP_READ or accs[m] == OP_WRITE): + g_m = m + code('.data = OP_consts_h + consts_bytes;') + code('.data_d = OP_consts_d + consts_bytes;') + FOR('d','0','') + code('(( *).data)[d] = h[d];') + ENDFOR() + code('consts_bytes += ROUND_UP(*sizeof());') + code('mvConstArraysToDevice(consts_bytes);') + code('') + + #managing constants + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') + code('cudaMemcpyToSymbol(opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT, &opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST,sizeof(int));') + ENDIF() + if dir_soa!=-1: + IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') + code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') + code('cudaMemcpyToSymbol(direct_'+name+'_stride_OP2CONSTANT,&direct_'+name+'_stride_OP2HOST,sizeof(int));') + ENDIF() + +# +# transfer global reduction initial data +# + + if ninds == 0 or atomics or op_color2: + comm('set CUDA execution parameters') + code('#ifdef OP_BLOCK_SIZE_'+str(nk)) + code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') + code('#else') + code(' int nthread = OP_block_size;') + code('#endif') + code('') + if ninds==0: + if reduct: + code('int nblocks = 400;') + else: + code('int nblocks = (set_size - 1) / nthread + 1;') + code('') + + if reduct: + comm('transfer global reduction data to GPU') + if ninds>0 and not atomics: + code('int maxblocks = 0;') + if op_color2: + 
FOR('col','0','Plan->ncolors') + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + code('int nblocks = (end - start - 1)/nthread + 1;') + code('maxblocks = MAX(maxblocks,nblocks);') + ENDFOR() + else: + FOR('col','0','Plan->ncolors') + code('maxblocks = MAX(maxblocks,Plan->ncolblk[col]);') + ENDFOR() + elif atomics and ninds>0: + code('int maxblocks = (MAX(set->core_size, set->size+set->exec_size-set->core_size)-1)/nthread+1;') + else: + code('int maxblocks = nblocks;') + + code('int reduct_bytes = 0;') + code('int reduct_size = 0;') + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code('reduct_bytes += ROUND_UP(maxblocks**sizeof());') + code('reduct_size = MAX(reduct_size,sizeof());') + + code('reallocReductArrays(reduct_bytes);') + code('reduct_bytes = 0;') + + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code('.data = OP_reduct_h + reduct_bytes;') + code('.data_d = OP_reduct_d + reduct_bytes;') + FOR('b','0','maxblocks') + FOR('d','0','') + if accs[g_m]==OP_INC: + code('(( *).data)[d+b*] = ZERO_;') + else: + code('(( *).data)[d+b*] = h[d];') + ENDFOR() + ENDFOR() + code('reduct_bytes += ROUND_UP(maxblocks**sizeof());') + code('mvReductArraysToDevice(reduct_bytes);') + code('') + +# +# kernel call for indirect version +# + if ninds>0 and not atomics: + comm('execute plan') + if not op_color2: + code('') + code('int block_offset = 0;') + FOR('col','0','Plan->ncolors') + IF('col==Plan->ncolors_core') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + #code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + if not op_color2: + code('#ifdef OP_BLOCK_SIZE_'+str(nk)) + code('int nthread = OP_BLOCK_SIZE_'+str(nk)+';') + code('#else') + code('int nthread = OP_block_size;') + code('#endif') + code('') + if op_color2: + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + 
code('int nblocks = (end - start - 1)/nthread + 1;') + else: + code('dim3 nblocks = dim3(Plan->ncolblk[col] >= (1<<16) ? 65535 : Plan->ncolblk[col],') + code('Plan->ncolblk[col] >= (1<<16) ? (Plan->ncolblk[col]-1)/65535+1: 1, 1);') + IF('Plan->ncolblk[col] > 0') + + if reduct or (inc_stage==1 and ind_inc): + if reduct and inc_stage==1: + code('int nshared = MAX(Plan->nshared,reduct_size*nthread);') + elif reduct: + code('int nshared = reduct_size*nthread;') + else: + code('int nshared = Plan->nsharedCol[col];') + code('op_cuda_'+name+'<<>>(') + else: + code('op_cuda_'+name+'<<>>(') + + if nopts > 0: + code('optflags,') + for m in range(1,ninds+1): + g_m = invinds[m-1] + code('( *).data_d,') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('arg'+str(g_m)+'.map_data_d, ') + for g_m in range(0,nargs): + if inds[g_m]==0: + code('(*).data_d,') + + if inc_stage==1 and ind_inc: + code('Plan->ind_map,') + code('Plan->loc_map,') + code('Plan->ind_sizes,') + code('Plan->ind_offs,') + if op_color2: + code('start,') + code('end,') + code('Plan->col_reord,') + else: + code('block_offset,') + code('Plan->blkmap,') + code('Plan->offset,') + code('Plan->nelems,') + code('Plan->nthrcol,') + code('Plan->thrcol,') + code('Plan->ncolblk[col],') + code('set->size+set->exec_size);') + code('') + if reduct: + comm('transfer global reduction data back to CPU') + IF('col == Plan->ncolors_owned-1') + code('mvReductArraysToHost(reduct_bytes);') + ENDIF() + if not op_color2: + ENDFOR() #TODO sztem ez forditva van... + code('block_offset += Plan->ncolblk[col];') + ENDIF() + +# +# +# + elif ninds>0 and atomics: + if reduct: + FOR('round','0','3') + else: + FOR('round','0','2') + IF('round==1') + code('op_mpi_wait_all_grouped(nargs, args, 2);') + #code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + if reduct: + code('int start = round==0 ? 0 : (round==1 ? 
set->core_size : set->size);') + code('int end = round==0 ? set->core_size : (round==1? set->size : set->size + set->exec_size);') + else: + code('int start = round==0 ? 0 : set->core_size;') + code('int end = round==0 ? set->core_size : set->size + set->exec_size;') + IF('end-start>0') + code('int nblocks = (end-start-1)/nthread+1;') + if reduct: + code('int nshared = reduct_size*nthread;') + code('op_cuda_'+name+'<<>>(') + else: + code('op_cuda_'+name+'<<>>(') + if nopts > 0: + code('optflags,') + for m in range(1,ninds+1): + g_m = invinds[m-1] + code('( *).data_d,') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('arg'+str(g_m)+'.map_data_d, ') + for g_m in range(0,nargs): + if inds[g_m]==0: + code('(*).data_d,') + code('start,end,set->size+set->exec_size);') + ENDIF() + if reduct: + code('if (round==1) mvReductArraysToHost(reduct_bytes);') + + ENDFOR() +# +# kernel call for direct version +# + else: + if reduct: + code('int nshared = reduct_size*nthread;') + code('op_cuda_'+name+'<<>>(') + else: + code('op_cuda_'+name+'<<>>(') + + indent = ' '#*(len(name)+42) + if nopts > 0: + code(indent+'optflags,') + for g_m in range(0,nargs): + if g_m > 0: + code(indent+'( *) .data_d,') + else: + code(indent+'( *) .data_d,') + + code(indent+'set->size );') + + if ninds>0 and not atomics: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + +# +# transfer global reduction initial data +# + if reduct: + if ninds == 0: + comm('transfer global reduction data back to CPU') + code('mvReductArraysToHost(reduct_bytes);') + + for m in range(0,nargs): + g_m = m + if maps[m]==OP_GBL and accs[m]!=OP_READ and accs[m] != OP_WRITE: + FOR('b','0','maxblocks') + FOR('d','0','') + if accs[m]==OP_INC: + code('h[d] = h[d] + (( *).data)[d+b*];') + elif accs[m]==OP_MIN: + code('h[d] = MIN(h[d],(( *).data)[d+b*]);') + elif 
accs[m]==OP_MAX: + code('h[d] = MAX(h[d],(( *).data)[d+b*]);') + ENDFOR() + ENDFOR() + + code('.data = (char *)h;') + code('op_mpi_reduce(&,h);') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('') + code('mvConstArraysToHost(consts_bytes);') + break + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + FOR('d','0','') + code('h[d] = (( *).data)[d];') + ENDFOR() + code('.data = (char *)h;') + code('op_mpi_reduce(&,h);') + + ENDIF() + code('op_mpi_set_dirtybit_cuda(nargs, args);') + +# +# update kernel record +# + + code('cutilSafeCall(cudaDeviceSynchronize());') + comm('update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + ENDIF() + depth = depth - 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('cuda'): + os.makedirs('cuda') + fid = open('cuda/'+name+'_kernel.cu','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text = '' + + comm('global constants') + + code('#ifndef MAX_CONST_SIZE') + code('#define MAX_CONST_SIZE 128') + code('#endif') + code('') + + for nc in range (0,len(consts)): + if consts[nc]['dim']==1: + code('__constant__ 
'+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda;') + else: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + + code('__constant__ '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_cuda['+num+'];') + code('') + + comm('header') + + if os.path.exists('./user_types.h'): + code('#ifndef OP_FUN_PREFIX\n#define OP_FUN_PREFIX __host__ __device__\n#endif') + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h"') + code('#include "op_cuda_rt_support.h"') + code('#include "op_cuda_reduction.h"') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + depth = depth + 2 + code('if (!OP_hybrid_gpu) return;') + if not consts[nc]['dim'] or int(consts[nc]['dim']) > 1: + IF('dim*sizeof('+consts[nc]['type'][1:-1]+')>MAX_CONST_SIZE') + code('printf("error: MAX_CONST_SIZE not big enough\\n"); exit(1);') + ENDIF() + code('cutilSafeCall(cudaMemcpyToSymbol('+consts[nc]['name']+'_cuda, dat, dim*sizeof('+consts[nc]['type'][1:-1]+')));') + depth = depth - 2 + code('}') + + code('') + comm('user kernel files') + + for nk in range(0,len(kernels)): + file_text = file_text +\ + '#include "'+kernels[nk]['name']+'_kernel.cu"\n' + + master = master.split('.')[0] + fid = open('cuda/'+master.split('.')[0]+'_kernels.cu','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/c/op2_gen_cuda_simple_hyb.py b/translator/c/op2_gen_cuda_simple_hyb.py new file mode 100644 index 000000000..c3fbebc2b --- /dev/null +++ b/translator/c/op2_gen_cuda_simple_hyb.py @@ -0,0 +1,208 @@ +########################################################################## +# +# CUDA code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cu for each kernel, +# plus a master kernel file 
+# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+text.rstrip()+'\n' + + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR_INC(i,start,finish,inc): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'+='+inc+' ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_cuda_simple_hyb(master, date, consts, kernels,sets): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + depth = 0 + FORTRAN = 0 + CPP = 1 + g_m = 0 +########################################################################## +# output one master kernel file 
+########################################################################## + + file_text = '' + comm('header') + code('#ifdef GPUPASS') + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + code('#define op_par_loop_'+name+' op_par_loop_'+name+'_gpu') + code('#include "'+master.split('.')[0]+'_kernels.cu"') + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + code('#undef op_par_loop_'+name) + code('#else') + code('#define SKIP_DECL_CONST') + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + code('#define op_par_loop_'+name+' op_par_loop_'+name+'_cpu') + code('#include "../openmp/'+master.split('.')[0]+'_kernels.cpp"') + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + code('#undef op_par_loop_'+name) + + code('') + comm('user kernel files') + + for nk in range(0,len(kernels)): + name = kernels[nk]['name'] + unique_args = list(range(1,kernels[nk]['nargs']+1)) + code('') + code('void op_par_loop_'+name+'_gpu(char const *name, op_set set,') + depth += 2 + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg arg'+str(g_m)+');') + else: + code('op_arg arg'+str(g_m)+',') + depth -= 2 + code('') + comm('GPU host stub function') + code('#if OP_HYBRID_GPU') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg arg'+str(g_m)+'){') + code('') + else: + code('op_arg arg'+str(g_m)+',') + + IF('OP_hybrid_gpu') + code('op_par_loop_'+name+'_gpu(name, set,') + depth += 2 + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('arg'+str(g_m)+');') + code('') + else: + code('arg'+str(g_m)+',') + depth -=2 + code('}else{') + code('op_par_loop_'+name+'_cpu(name, set,') + depth += 2 + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('arg'+str(g_m)+');') + code('') + else: + code('arg'+str(g_m)+',') 
+ depth -=2 + ENDIF() + depth-=2 + code('}') + code('#else') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg arg'+str(g_m)+'){') + code('') + else: + code('op_arg arg'+str(g_m)+',') + + + code('op_par_loop_'+name+'_gpu(name, set,') + depth += 2 + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('arg'+str(g_m)+');') + code('') + else: + code('arg'+str(g_m)+',') + depth-=2 + code('}') + depth-=2 + code('#endif //OP_HYBRID_GPU') + code("#endif") + master = master.split('.')[0] + fid = open('cuda/'+master.split('.')[0]+'_hybkernels.cu','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/c/op2_gen_mpi_vec.py b/translator/c/op2_gen_mpi_vec.py new file mode 100644 index 000000000..8dd34d810 --- /dev/null +++ b/translator/c/op2_gen_mpi_vec.py @@ -0,0 +1,811 @@ +########################################################################## +# +# MPI Sequential code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import glob +import os +import op2_gen_common + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR2(i,start,finish,inc): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1, '+inc) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'+='+inc+' ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_mpi_vec(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + grouped = 0 + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or 
sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) +# +# set three logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP : + j = i + indirect_kernel = j >= 0 + + if nargs != nargs_novec: + return +#################################################################################### +# generate the user kernel function - creating versions for vectorisation as needed +#################################################################################### + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + +# +# First original version +# + comm('user function') + file_name = decl_filepath + + f = open(file_name, 'r') + kernel_text = f.read() + file_text += kernel_text + f.close() + + ## Clang compiler can struggle to vectorize a loop if it uses a mix of + ## Python-generated simd arrays for indirect data AND pointers to direct + ## data. 
Fix by also generating simd arrays for direct data: + do_gen_direct_simd_arrays = True + +# +# Modified vectorisable version if its an indirect kernel +# - direct kernels can be vectorised without modification +# + if indirect_kernel: + code('#ifdef VECTORIZE') + comm('user function -- modified for vectorisation') + f = open(file_name, 'r') + kernel_text = f.read() + f.close() + + kernel_text = op2_gen_common.comment_remover(kernel_text) + kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) + + p = re.compile('void\\s+\\b'+name+'\\b') + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + + #i = kernel_text[0:i].rfind('\n') #reverse find + j = kernel_text[i:].find('{') + k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') + signature_text = kernel_text[i:i+j] + l = signature_text[0:].find('(') + head_text = signature_text[0:l] #save function name + m = op2_gen_common.para_parse(signature_text, 0, '(', ')') + signature_text = signature_text[l+1:m] + body_text = kernel_text[i+j+1:k] + + ## Replace occurrences of '#include ""' within loop with the contents of : + body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) + + + # check for number of arguments + nargs_actual = len(signature_text.split(',')) + if nargs_actual != nargs: + print(('Error parsing user kernel({0}): must have {1} arguments (instead it has {2})'.format(name, nargs, nargs_actual))) + return + + new_signature_text = '' + for i in range(0,nargs): + var = signature_text.split(',')[i].strip() + + if do_gen_direct_simd_arrays: + do_gen_simd_array_arg = maps[i] != OP_GBL + else: + do_gen_simd_array_arg = maps[i] != OP_GBL and maps[i] != OP_ID + if do_gen_simd_array_arg: + #remove * and add [*][SIMD_VEC] + var = var.replace('*','') + #locate var in body and replace by adding [idx] + length = 
len(re.compile('\\s+\\b').split(var)) + var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + + #print var2 + + body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) + array_access_pattern = '\[[\w\(\)\+\-\*\s\\\\]*\]' + + ## It has been observed that vectorisation can fail on loops with increments, + ## but replacing them with writes succeeds. + ## For example with Clang on particular loops, vectorisation fails with message: + ## "loop not vectorized: loop control flow is not understood by vectorizer" + ## replacing increments with writes solves this. + ## Replacement is data-safe due to use of local/intermediate SIMD arrays. + ## Hopefully the regex is matching all increments. + ## And for loops that were being vectorised, this change can give a small perf boost. + if maps[i] == OP_MAP and accs[i] == OP_INC: + ## Replace 'var' increments with writes: + body_text = re.sub(r'('+var2+array_access_pattern+'\s*'+')'+re.escape("+="), r'\1'+'=', body_text) + + ## Append vector array access: + body_text = re.sub(r'('+var2+array_access_pattern+')', r'\1'+'[idx]', body_text) + + var = var + '[][SIMD_VEC]' + #var = var + '[restrict][SIMD_VEC]' + new_signature_text += var+', ' + + + #add ( , idx and ) + signature_text = "#if defined __clang__ || defined __GNUC__\n" + signature_text += "__attribute__((always_inline))\n" + signature_text += "#endif\n" + signature_text += "inline " + head_text + '( '+new_signature_text + 'int idx ) {' + #finally update name + signature_text = signature_text.replace(name,name+'_vec') + + #print head_text + #print signature_text + #print body_text + + file_text += signature_text + body_text + '}\n' + code('#endif'); + + + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + 
for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# create aligned pointers +# + comm('create aligned pointers for dats') + for g_m in range (0,nargs): + if maps[g_m] != OP_GBL: + if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): + code('ALIGNED_ * __restrict__ ptr'+\ + str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + #code('* __restrict__ __attribute__((align_value (_ALIGN))) ptr'+\ + #str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',_ALIGN);') + + else: + code('ALIGNED_ const * __restrict__ ptr'+\ + str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + code('DECLARE_PTR_ALIGNED(ptr'+str(g_m)+',_ALIGN);') + #code('const * __restrict__ __attribute__((align_value (_ALIGN))) ptr'+\ + #str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + + + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + +# +# indirect bits +# + if ninds>0: + IF('OP_diags>2') + code('printf(" kernel routine with 
indirection: '+name+'\\n");') + ENDIF() + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + + code('') + if grouped: + code('int exec_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 1);') + else: + code('int exec_size = op_mpi_halo_exchanges(set, nargs, args);') + + code('') + IF('exec_size >0') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + code('#ifdef VECTORIZE') + + code('#pragma novector') + FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + #initialize globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code(' dat{0}[SIMD_VEC];'.format(g_m)) + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('dat{0}[i] = 0.0;'.format(g_m)) + elif accs[g_m] == OP_MAX: + code('dat{0}[i] = -INFINITY;'.format(g_m)) + elif accs[g_m] == OP_MIN: + code('dat{0}[i] = INFINITY;'.format(g_m)) + elif accs[g_m] == OP_READ: + code('dat{0}[i] = *((*)arg{0}.data);'.format(g_m)) + ENDFOR() + + code('if (ncore_size && n>0 && n % OP_mpi_test_frequency == 0)') + code(' op_mpi_test_all(nargs,args);') + IF('(n+SIMD_VEC >= set->core_size) && (n+SIMD_VEC-set->core_size < SIMD_VEC)') + if grouped: + code('op_mpi_wait_all_grouped(nargs, args, 1);') + else: + code('op_mpi_wait_all(nargs, args);') + ENDIF() + for g_m in range(0,nargs): + if do_gen_direct_simd_arrays: + if (maps[g_m] in [OP_MAP, OP_ID]) and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]): + code('ALIGNED_ dat'+str(g_m)+'[][SIMD_VEC];') + else: + if maps[g_m] == OP_MAP and (accs[g_m] in [OP_READ, OP_RW, OP_WRITE, OP_INC]): + code('ALIGNED_ dat'+str(g_m)+'[][SIMD_VEC];') + + #setup gathers + idx_map_template = "int idx{0}_ = * arg{1}.map_data[(n+i) * arg{1}.map->dim + {2}];" + idx_id_template = "int idx{0}_ = * (n+i);" + code('#pragma omp simd simdlen(SIMD_VEC)') + FOR('i','0','SIMD_VEC') + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] in [OP_READ, OP_RW, 
OP_WRITE]):#and (not mapinds[g_m] in k): + code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m])) + elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID : + code(idx_id_template.format(g_m)) + code('') + + init_dat_template = "dat{0}[{1}][i] = (ptr{0})[idx{0}_ + {1}];" + zero_dat_template = "dat{0}[{1}][i] = 0.0;" + for g_m in range(0,nargs): + if do_gen_direct_simd_arrays: + ## also 'gather' directly-accessed data, because SOME compilers + ## struggle to vectorise otherwise (e.g. Clang). + if maps[g_m] != OP_GBL : + if accs[g_m] in [OP_READ, OP_RW]: + for d in range(0,int(dims[g_m])): + code(init_dat_template.format(g_m, d)) + code('') + elif accs[g_m] == OP_INC: + for d in range(0,int(dims[g_m])): + code(zero_dat_template.format(g_m, d)) + code('') + else: + if maps[g_m] == OP_MAP : + if accs[g_m] in [OP_READ, OP_RW]:#and (not mapinds[g_m] in k): + for d in range(0,int(dims[g_m])): + init_dat_str = init_dat_template.format(g_m, d) + code(init_dat_str) + code('') + elif (accs[g_m] == OP_INC): + for d in range(0,int(dims[g_m])): + zero_dat_str = zero_dat_template.format(g_m, d) + code(zero_dat_str) + code('') + else: #globals + if (accs[g_m] == OP_INC): + # for d in range(0,int(dims[g_m])): + # code('dat'+str(g_m)+'[i] = 0.0;') + # code('') + pass + + ENDFOR() + #kernel call + code('#pragma omp simd simdlen(SIMD_VEC)') + FOR('i','0','SIMD_VEC') + line = name+'_vec(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if (not do_gen_direct_simd_arrays) and maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)],' + elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data,' + elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + line = line + indent +'&dat'+str(g_m)+'[i],' + else: + line = line + indent + 'dat'+str(g_m)+',' + line = line +indent +'i);' + code(line) + ENDFOR() + #do the scatters + FOR('i','0','SIMD_VEC') + if nmaps > 0: + for g_m in 
range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]):#and (not mapinds[g_m] in k): + code(idx_map_template.format(g_m, invmapinds[inds[g_m]-1], idxs[g_m])) + elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID : + if (accs[g_m] in [OP_INC, OP_RW, OP_WRITE]): + code(idx_id_template.format(g_m)) + code('') + dat_scatter_inc_template = "(ptr{0})[idx{0}_ + {1}] += dat{0}[{1}][i];" + dat_scatter_wr_template = "(ptr{0})[idx{0}_ + {1}] = dat{0}[{1}][i];" + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_INC ): + for d in range(0,int(dims[g_m])): + code(dat_scatter_inc_template.format(g_m, d)) + code('') + elif accs[g_m] in [OP_WRITE, OP_RW]: + for d in range(0,int(dims[g_m])): + code(dat_scatter_wr_template.format(g_m, d)) + code('') + elif do_gen_direct_simd_arrays and maps[g_m] == OP_ID: + ## also scatter directly-written data + if (accs[g_m] == OP_INC ): + for d in range(0,int(dims[g_m])): + code(dat_scatter_inc_template.format(g_m, d)) + elif accs[g_m] in [OP_WRITE, OP_RW]: + for d in range(0,int(dims[g_m])): + code(dat_scatter_wr_template.format(g_m, d)) + code('') + ENDFOR() + + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('*(*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];') + elif accs[g_m] == OP_MAX: + code('*(*)arg'+str(g_m)+'.data = MAX(*(*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') + elif accs[g_m] == OP_MIN: + code('*(*)arg'+str(g_m)+'.data = MIN(*(*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') + ENDFOR() + + + ENDFOR() + code('') + comm('remainder') + FOR('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size') + depth = depth -2 + code('#else') + FOR('n','0','exec_size') + depth = depth -2 + code('#endif') + depth = depth +2 + IF('n==set->core_size') + if grouped: + code('op_mpi_wait_all_grouped(nargs, args, 1);') + else: + code('op_mpi_wait_all(nargs, args);') + ENDIF() + if nmaps > 0: + k = [] + for g_m in 
range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + IF('.opt') + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + +# +# kernel call for direct version +# + else: + code('#ifdef VECTORIZE') + + code('#pragma novector') + FOR2('n','0','(exec_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + + #initialize globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code(' dat{0}[SIMD_VEC];'.format(g_m)) + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('dat{0}[i] = 0.0;'.format(g_m)) + elif accs[g_m] == OP_MAX: + code('dat{0}[i] = -INFINITY;'.format(g_m)) + elif accs[g_m] == OP_MIN: + code('dat{0}[i] = INFINITY;'.format(g_m)) + elif accs[g_m] == OP_READ: + code('dat{0}[i] = *((*)arg{0}.data);'.format(g_m)) + ENDFOR() + + code('#pragma omp simd simdlen(SIMD_VEC)') + 
FOR('i','0','SIMD_VEC') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)]' + if maps[g_m] == OP_MAP: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + line = line + indent +'&dat'+str(g_m)+'[i]' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('*(*)arg'+str(g_m)+'.data += dat'+str(g_m)+'[i];') + elif accs[g_m] == OP_MAX: + code('*(*)arg'+str(g_m)+'.data = MAX(*(*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') + elif accs[g_m] == OP_MIN: + code('*(*)arg'+str(g_m)+'.data = MIN(*(*)arg'+str(g_m)+'.data,dat'+str(g_m)+'[i]);') + ENDFOR() + ENDFOR() + + comm('remainder') + FOR ('n','(exec_size/SIMD_VEC)*SIMD_VEC','exec_size') + depth = depth -2 + code('#else') + FOR('n','0','exec_size') + depth = depth -2 + code('#endif') + depth = depth +2 + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+'*n]' + if maps[g_m] == OP_GBL: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('exec_size == 0 || exec_size == set->core_size') + if grouped: + code('op_mpi_wait_all_grouped(nargs, args, 1);') + else: + code('op_mpi_wait_all(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + code('op_mpi_reduce(&,('+typs[g_m]+'*).data);') + + code('op_mpi_set_dirtybit(nargs, args);') + code('') + +# +# update 
kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + else: + names = [] + for g_m in range(0,ninds): + mult='' + if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[invinds[g_m]] in names: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';') + names = names + [var[invinds[g_m]]] + for g_m in range(0,nargs): + mult='' + if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[g_m] in names: + names = names + [var[invinds[g_m]]] + if maps[g_m] == OP_ID: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + elif maps[g_m] == OP_GBL: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;') + + depth -= 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('vec'): + os.makedirs('vec') + fid = open('vec/'+name+'_veckernel.cpp','w') + date = datetime.datetime.now() + #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + 
fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + code('#define double_ALIGN 128') + code('#define float_ALIGN 64') + code('#define int_ALIGN 64') + code('#ifdef VECTORIZE') + code('#define SIMD_VEC 4') + code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))') + code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))') + code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))') + code(' #ifdef __ICC') + code(' #define DECLARE_PTR_ALIGNED(X, Y) __assume_aligned(X, Y)') + code(' #else') + code(' #define DECLARE_PTR_ALIGNED(X, Y)') + code(' #endif') + code('#else') + code('#define ALIGNED_double') + code('#define ALIGNED_float') + code('#define ALIGNED_int') + code('#define DECLARE_PTR_ALIGNED(X, Y)') + code('#endif') + code('') + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h"') + code('') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + code('}') + code('') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_veckernel.cpp"') + master = master.split('.')[0] + fid = open('vec/'+master.split('.')[0]+'_veckernels.cpp','w') + fid.write('//\n// 
auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/c/op2_gen_omp_vec.py b/translator/c/op2_gen_omp_vec.py new file mode 100644 index 000000000..30b740edc --- /dev/null +++ b/translator/c/op2_gen_omp_vec.py @@ -0,0 +1,865 @@ +########################################################################## +# +# MPI Sequential code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import glob + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR2(i,start,finish,inc): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1, '+inc) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'+='+inc+' ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + 
depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def comment_remover(text): + """Remove comments from text""" + + def replacer(match): + s = match.group(0) + if s.startswith('/'): + return '' + else: + return s + pattern = re.compile( + r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"', + re.DOTALL | re.MULTILINE + ) + return re.sub(pattern, replacer, text) + +def remove_trailing_w_space(text): + text = text+' ' + line_start = 0 + line = "" + line_end = 0 + striped_test = '' + count = 0 + while 1: + line_end = text.find("\n",line_start+1) + line = text[line_start:line_end] + line = line.rstrip() + striped_test = striped_test + line +'\n' + line_start = line_end + 1 + line = "" + if line_end < 0: + return striped_test[:-1] + +def para_parse(text, j, op_b, cl_b): + """Parsing code block, i.e. 
text to find the correct closing brace""" + + depth = 0 + loc2 = j + + while 1: + if text[loc2] == op_b: + depth = depth + 1 + + elif text[loc2] == cl_b: + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +def op2_gen_omp_vec(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) +# +# set three logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP : + j = i + indirect_kernel = j >= 0 + +#################################################################################### +# generate the user kernel function - creating versions for vectorisation as needed +#################################################################################### + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + +# +# First original version +# + comm('user function') + file_name = decl_filepath + + f = open(file_name, 'r') + kernel_text = f.read() + 
file_text += kernel_text + f.close() + +# +# Modified vectorisable version if its an indirect kernel +# - direct kernels can be vectorised without modification +# + if indirect_kernel: + if ind_inc: + code('#define VECTORIZE') + code('#ifdef VECTORIZE') + comm('user function -- modified for vectorisation') + f = open(file_name, 'r') + kernel_text = f.read() + f.close() + + kernel_text = comment_remover(kernel_text) + kernel_text = remove_trailing_w_space(kernel_text) + + p = re.compile('void\\s+\\b'+name+'\\b') + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + + #i = kernel_text[0:i].rfind('\n') #reverse find + j = kernel_text[i:].find('{') + k = para_parse(kernel_text, i+j, '{', '}') + signature_text = kernel_text[i:i+j] + l = signature_text[0:].find('(') + head_text = signature_text[0:l] #save function name + m = para_parse(signature_text, 0, '(', ')') + signature_text = signature_text[l+1:m] + body_text = kernel_text[i+j+1:k] + + + # check for number of arguments + if len(signature_text.split(',')) != nargs: + print('Error parsing user kernel(%s): must have %d arguments' \ + % name, nargs) + return + + new_signature_text = '' + for i in range(0,nargs): + var = signature_text.split(',')[i].strip() + + if maps[i] != OP_GBL and maps[i] != OP_ID: + #remove * and add [*][SIMD_VEC] + var = var.replace('*','') + #locate var in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var)) + var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + + #print var2 + + body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) + body_text = re.sub(r'('+var2+'\[[A-Za-z0-9]*\]'+')', r'\1'+'[idx]', body_text) + + + var = var + '[*][SIMD_VEC]' + #var = var + '[restrict][SIMD_VEC]' + new_signature_text += var+', ' + + + #add ( , idx and ) + signature_text = head_text + '( '+new_signature_text + 'int idx ) {' 
+ #finally update name + signature_text = signature_text.replace(name,name+'_vec') + + #print head_text + #print signature_text + #print body_text + + file_text += signature_text + body_text + '}\n' + code('#endif'); + + + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# create aligned pointers +# + reduce_clauses = '' + aligned_clauses = '' + comm('create aligned pointers for dats') + for g_m in range (0,nargs): + if maps[g_m] != OP_GBL: + if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): + code('ALIGNED_ * __restrict__ ptr'+\ + str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + #code('* __restrict__ __attribute__((align_value (_ALIGN))) ptr'+\ + #str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + code('__assume_aligned(ptr'+str(g_m)+',_ALIGN);') + aligned_clauses 
= aligned_clauses + 'ptr'+str(g_m)+',' + + else: + code('ALIGNED_ const * __restrict__ ptr'+\ + str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + code('__assume_aligned(ptr'+str(g_m)+',_ALIGN);') + aligned_clauses = aligned_clauses + 'ptr'+str(g_m)+',' + #code('const * __restrict__ __attribute__((align_value (_ALIGN))) ptr'+\ + #str(g_m)+' = ( *) arg'+str(g_m)+'.data;') + elif accs[g_m]==OP_MIN or accs[g_m]==OP_MAX or accs[g_m]==OP_INC: + if not dims[g_m].isdigit() or int(dims[g_m])>1: + print('Error reduce dim < 1') + exit(2) + code(' h = *( *)arg'+str(g_m)+'.data;') + if accs[g_m]==OP_MIN: + reduce_clauses = reduce_clauses + 'reduction(min:arg'+str(g_m)+'h) ' + elif accs[g_m]==OP_MAX: + reduce_clauses = reduce_clauses + 'reduction(max:arg'+str(g_m)+'h) ' + elif accs[g_m]==OP_INC: + reduce_clauses = reduce_clauses + 'reduction(+:arg'+str(g_m)+'h) ' + + aligned_clauses = aligned_clauses[:-1] + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + +# +# indirect bits +# + if ninds>0: + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + code('') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + + code('') + code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + + code('') + IF('set_size >0') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + comm(' get plan') + code('op_plan *Plan = 
op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);') + + code('') + + + #colored loop + comm(' execute plan') + code('int block_offset = 0;') + FOR('col','0','Plan->ncolors') + IF('col==Plan->ncolors_core') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + code('int nblocks = Plan->ncolblk[col];') + code('') + code('#pragma omp parallel for '+reduce_clauses) + FOR('blockIdx','0','nblocks') + code('int blockId = Plan->blkmap[blockIdx + block_offset];') + code('int nelem = Plan->nelems[blockId];') + code('int offset_b = Plan->offset[blockId];') + + code('#ifdef VECTORIZE') + + #initialze globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code(' dat{0}[SIMD_VEC];'.format(g_m)) + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('dat{0}[i] = 0.0;'.format(g_m)) + elif accs[g_m] == OP_MAX: + code('dat{0}[i] = -INFINITY;'.format(g_m)) + elif accs[g_m] == OP_MIN: + code('dat{0}[i] = INFINITY;'.format(g_m)) + elif accs[g_m] == OP_READ: + code('dat{0}[i] = *((*)arg{0}.data);'.format(g_m)) + ENDFOR() + + comm('peel left remainder') + FOR('n','offset_b','((offset_b-1)/SIMD_VEC+1)*SIMD_VEC') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + IF('.opt') + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + 
'+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + else: + line = line + indent + '&arg'+str(g_m)+'h' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + + + + code('#pragma novector') + FOR2('n','((offset_b-1)/SIMD_VEC+1)*SIMD_VEC','((offset_b+nelem)/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + IF('n+SIMD_VEC >= set->core_size') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \ + or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \ + or accs[g_m] == OP_INC): + code('ALIGNED_ dat'+str(g_m)+'[][SIMD_VEC];') + + #setup gathers + code('#pragma omp simd simdlen(SIMD_VEC) aligned('+aligned_clauses+')') + FOR('i','0','SIMD_VEC') + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE):#and (not mapinds[g_m] in k): + code('int idx'+str(g_m)+'_ = * arg'+str(invmapinds[inds[g_m]-1])+'.map_data[(n+i) * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW):#and (not mapinds[g_m] in k): + for d in range(0,int(dims[g_m])): + code('dat'+str(g_m)+'['+str(d)+'][i] = (ptr'+str(g_m)+')[idx'+str(g_m)+'_ + '+str(d)+'];') + code('') + elif (accs[g_m] == OP_INC): + for d in range(0,int(dims[g_m])): + code('dat'+str(g_m)+'['+str(d)+'][i] = 0.0;') + code('') + else: #globals + if (accs[g_m] == OP_INC): + for d in range(0,int(dims[g_m])): + 
code('dat'+str(g_m)+'[i] = 0.0;') + code('') + + ENDFOR() + #kernel call + code('#pragma omp simd simdlen(SIMD_VEC) aligned('+aligned_clauses+')') + FOR('i','0','SIMD_VEC') + line = name+'_vec(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)],' + elif maps[g_m] == OP_GBL and accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data,' + elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + line = line + indent +'&dat'+str(g_m)+'[i],' + else: + line = line + indent + 'dat'+str(g_m)+',' + line = line +indent +'i);' + code(line) + ENDFOR() + #do the scatters + FOR('i','0','SIMD_VEC') + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE):#and (not mapinds[g_m] in k): + code('int idx'+str(g_m)+'_ = * arg'+str(invmapinds[inds[g_m]-1])+'.map_data[(n+i) * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_INC ): + for d in range(0,int(dims[g_m])): + code('(ptr'+str(g_m)+')[idx'+str(g_m)+'_ + '+str(d)+'] += dat'+str(g_m)+'['+str(d)+'][i];') + code('') + if (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW): + for d in range(0,int(dims[g_m])): + code('(ptr'+str(g_m)+')[idx'+str(g_m)+'_ + '+str(d)+'] = dat'+str(g_m)+'['+str(d)+'][i];') + code('') + ENDFOR() + + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('h += dat'+str(g_m)+'[i];') + elif accs[g_m] == OP_MAX: + code('h = MAX(h,dat'+str(g_m)+'[i]);') + elif accs[g_m] == OP_MIN: + code('h = MIN(h,dat'+str(g_m)+'[i]);') + ENDFOR() + + + ENDFOR() + code('') + comm('remainder') + FOR('n','((offset_b+nelem)/SIMD_VEC)*SIMD_VEC','offset_b+nelem') + depth = depth -2 + code('#else') + if not ind_inc: + 
code('#pragma omp simd simdlen(SIMD_VEC) aligned('+aligned_clauses+') '+reduce_clauses) + FOR('n','offset_b','offset_b+nelem') + depth = depth -2 + code('#endif') + depth = depth +2 + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + code('') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + else: + line = line + indent + '&arg'+str(g_m)+'h' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + ENDFOR() #REDUCTIONS + code('block_offset += nblocks;'); + ENDFOR() #Block colors + +# +# kernel call for direct version +# + else: + code('#ifdef VECTORIZE') + + #initialize globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code(' dat{0}[SIMD_VEC];'.format(g_m)) + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('dat{0}[i] = 0.0;'.format(g_m)) + elif accs[g_m] == OP_MAX: + code('dat{0}[i] = -INFINITY;'.format(g_m)) + elif accs[g_m] == OP_MIN: + code('dat{0}[i] = INFINITY;'.format(g_m)) + elif accs[g_m] == OP_READ: + code('dat{0}[i] = *((*)arg{0}.data);'.format(g_m)) + ENDFOR() + + code('#pragma novector') + code('#pragma omp parallel for '+reduce_clauses) + FOR2('n','0','(set_size/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + + code('#pragma simdlen(SIMD_VEC) aligned('+aligned_clauses+')') + FOR('i','0','SIMD_VEC') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + 
indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * (n+i)]' + if maps[g_m] == OP_MAP: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + else: + line = line + indent +'&dat'+str(g_m)+'[i]' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + FOR('i','0','SIMD_VEC') + if accs[g_m] == OP_INC: + code('arg'+str(g_m)+'h += dat'+str(g_m)+'[i];') + elif accs[g_m] == OP_MAX: + code('arg'+str(g_m)+'h = MAX(arg'+str(g_m)+'h,dat'+str(g_m)+'[i]);') + elif accs[g_m] == OP_MIN: + code('arg'+str(g_m)+'h = MIN(arg'+str(g_m)+'h,dat'+str(g_m)+'[i]);') + ENDFOR() + ENDFOR() + + comm('remainder') + FOR ('n','(set_size/SIMD_VEC)*SIMD_VEC','set_size') + depth = depth -2 + code('#else') + code('#pragma omp parallel for simd aligned('+aligned_clauses+') '+reduce_clauses) + FOR('n','0','set_size') + depth = depth -2 + code('#endif') + depth = depth +2 + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(ptr'+str(g_m)+')['+str(dims[g_m])+'*n]' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + else: + line = line + indent + '&arg'+str(g_m)+'h' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('set_size == 0 || set_size == set->core_size') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + code('*(*).data = h;') + code('op_mpi_reduce(&,('+typs[g_m]+'*).data);') + + 
code('op_mpi_set_dirtybit(nargs, args);') + code('') + +# +# update kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + else: + names = [] + for g_m in range(0,ninds): + mult='' + if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[invinds[g_m]] in names: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';') + names = names + [var[invinds[g_m]]] + for g_m in range(0,nargs): + mult='' + if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[g_m] in names: + names = names + [var[invinds[g_m]]] + if maps[g_m] == OP_ID: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + elif maps[g_m] == OP_GBL: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;') + + depth -= 2 + code('}') + code('#undef VECTORIZE') + + +########################################################################## +# output individual kernel file +########################################################################## + fid = open(name+'_ompveckernel.cpp','w') + date = datetime.datetime.now() + #fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write('//\n// auto-generated by 
op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + code('#define double_ALIGN 128') + code('#define float_ALIGN 64') + code('#define int_ALIGN 64') + code('#define VECTORIZE') + code('#ifdef VECTORIZE') + code('#define SIMD_VEC 4') + code('#define ALIGNED_double __attribute__((aligned(double_ALIGN)))') + code('#define ALIGNED_float __attribute__((aligned(float_ALIGN)))') + code('#define ALIGNED_int __attribute__((aligned(int_ALIGN)))') + code('#else') + code('#define ALIGNED_double') + code('#define ALIGNED_float') + code('#define ALIGNED_int') + code('#endif') + code('#undef VECTORIZE') + code('') + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'] > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + code('#include "op_lib_cpp.h"') + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h"') + code('') + + code('#ifndef SKIP_DECL_CONST') + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + code('}') + code('') + code('#endif') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_ompveckernel.cpp"') + master = master.split('.')[0] + fid = open(master.split('.')[0]+'_ompveckernels.cpp','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() diff --git 
a/translator/c/op2_gen_openacc.py b/translator/c/op2_gen_openacc.py new file mode 100644 index 000000000..af8aef626 --- /dev/null +++ b/translator/c/op2_gen_openacc.py @@ -0,0 +1,697 @@ +########################################################################## +# +# OpenACC code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file openacc/xxx_acckernel.c for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import glob +import datetime +import op2_gen_common +import os + +def comm(line): # append one comment line to file_text: bare newline if line is empty, '! ' prefix for FORTRAN, indented '//' for CPP + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): # substitute the per-argument values for argument index m into a template line and return it + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): # indirect-argument values are only substituted when m indexes into inddims + line = re.sub('',str(inddims[m]),line) # NOTE(review): every pattern in this function is the empty string, so re.sub inserts the value at each character boundary; the placeholder tokens look lost - confirm against upstream + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): # append text, passed through rep() for the current g_m and right-stripped, to file_text at the current indent + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': # an empty line gets no indentation + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + + +def FOR(i,start,finish): # emit a loop header for i over [start,finish) and indent by 2 + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDFOR(): # dedent by 2 and emit the loop terminator + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): # emit a conditional header testing the expression in line and indent by 2 + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): # dedent by 2 and emit the conditional terminator + global file_text, FORTRAN, CPP, g_m + global depth +
depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_openacc(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ and accs[i] != OP_WRITE: + j = i + reduct = j >= 0 + +########################################################################## +# start with the user kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + file_name = decl_filepath + f = open(file_name, 'r') + kernel_text = f.read() + f.close() + + comm('user function') + + if CPP: + includes = op2_gen_common.extract_includes(kernel_text) + if len(includes) > 0: + for include in includes: + code(include) + code("") + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') + code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('int direct_'+name+'_stride_OP2CONSTANT;') + code('int direct_'+name+'_stride_OP2HOST=-1;') + dir_soa = g_m + break + + comm('user function') + + kernel_text = op2_gen_common.comment_remover(kernel_text) + kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) + + p = re.compile('void\\s+\\b'+name+'\\b') + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + + #i = kernel_text[0:i].rfind('\n') #reverse find + j = 
kernel_text[i:].find('{') + k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') + signature_text = kernel_text[i:i+j] + l = signature_text[0:].find('(') + head_text = signature_text[0:l].strip() #save function name + m = op2_gen_common.para_parse(signature_text, 0, '(', ')') + signature_text = signature_text[l+1:m] + body_text = kernel_text[i+j+1:k] + + # check for number of arguments + if len(signature_text.split(',')) != nargs_novec: + print('Error parsing user kernel('+name+'): must have '+str(nargs_novec)+' arguments') + return + + for i in range(0,nargs_novec): + var = signature_text.split(',')[i].strip() + if kernels[nk]['soaflags'][i]: + var = var.replace('*','') + #locate var in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var)) + var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + + if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: + body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ + op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + else: + body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) + body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ + op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + + head_text += "_openacc" + signature_text = '//#pragma acc routine\ninline ' + head_text + '( '+signature_text + ') {' + file_text += signature_text + body_text + '}\n' + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + for g_m in range 
(0,nargs): + if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + + if nopts>0: + code('int optflags = 0;') + for i in range(0,nargs): + if optflags[i] == 1: + IF('args['+str(i)+'].opt') + code('optflags |= 1<<'+str(optidxs[i])+';') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('') + +# +# indirect bits +# + if ninds>0: + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + + code('') + comm(' get plan') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + 
code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + + code('') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ: + if not dims[g_m].isdigit() or int(dims[g_m]) > 1: + print('ERROR: OpenACC does not support multi-dimensional op_arg_gbl variables') + exit(-1) + code(' _l = h[0];') + + if ninds > 0: + code('') + code('int ncolors = 0;') + code('') + IF('set_size >0') + code('') + #managing constants + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;') + ENDIF() + if dir_soa!=-1: + IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') + code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') + code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;') + ENDIF() + code('') + comm('Set up typed device pointers for OpenACC') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): + k = k + [invmapinds[inds[g_m]-1]] + code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+'* data'+str(g_m)+' = ('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;') + + 
for m in range(1,ninds+1): + g_m = invinds[m-1] + code(' *data'+str(g_m)+' = ( *).data_d;') + +# +# kernel call for indirect version +# + if ninds>0: + code('') + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') + code('ncolors = Plan->ncolors;') + code('int *col_reord = Plan->col_reord;') + code('int set_size1 = set->size + set->exec_size;') + code('') + comm(' execute plan') + FOR('col','0','Plan->ncolors') + IF('col==1') + code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + code('') +# code('#pragma omp parallel for') + line = '#pragma acc parallel loop independent deviceptr(col_reord,' + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): + k = k + [invmapinds[inds[g_m]-1]] + line = line + 'map'+str(mapinds[g_m])+',' + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line+'data'+str(g_m)+',' + for m in range(1,ninds+1): + g_m = invinds[m-1] + line = line + 'data'+str(g_m)+',' + line = line[:-1]+')' + + if reduct: + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE: + if accs[g_m] == OP_INC: + line = line + ' reduction(+:arg'+str(g_m)+'_l)' + if accs[g_m] == OP_MIN: + line = line + ' reduction(min:arg'+str(g_m)+'_l)' + if accs[g_m] == OP_MAX: + line = line + ' reduction(max:arg'+str(g_m)+'_l)' + code(line) + FOR('e','start','end') + code('int n = col_reord[e];') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ + '[n + 
set_size1 * '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + IF('optflags & 1<<'+str(optidxs[g_m])) + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ + '[n + set_size1 * '+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + if soaflags[g_m]: + line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n' + else: + line = line + indent + ' &data'+str(first)+'[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) + code('') + line = name+'_openacc(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if soaflags[g_m]: + line = line + indent + '&data'+str(g_m)+'[n]' + else: + line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + if vectorised[g_m]: + if g_m+1 in unique_args: + line = line + indent + 'arg'+str(g_m)+'_vec' + else: + if soaflags[g_m]: + line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]' + else: + line = line + indent + '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + line = line + indent +'&arg'+str(g_m)+'_l' + if g_m < nargs-1: + if g_m+1 in unique_args and not g_m+1 == unique_args[-1]: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + code('') + + if reduct: + comm(' combine reduction data') + 
IF('col == Plan->ncolors_owned-1') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: + code('h[0] = _l;') + elif accs[g_m]==OP_MIN: + code('h[0] = MIN(h[0],_l);') + ENDFOR() + elif accs[g_m]==OP_MAX: + code('h[0] = MAX(h[0],_l);') + else: + error('internal error: invalid reduction option') + ENDFOR() + ENDIF() + ENDFOR() + +# +# kernel call for direct version +# + else: + line = '#pragma acc parallel loop independent deviceptr(' + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line+'data'+str(g_m)+',' + line = line[:-1]+')' + + if reduct: + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + if accs[g_m] == OP_INC: + line = line + ' reduction(+:arg'+str(g_m)+'_l)' + if accs[g_m] == OP_MIN: + line = line + ' reduction(min:arg'+str(g_m)+'_l)' + if accs[g_m] == OP_MAX: + line = line + ' reduction(max:arg'+str(g_m)+'_l)' + code(line) + FOR('n','0','set->size') + line = name+'_openacc(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if soaflags[g_m]: + line = line + indent + '&data'+str(g_m)+'[n]' + else: + line = line + indent + '&data'+str(g_m)+'['+str(dims[g_m])+'*n]' + if maps[g_m] == OP_GBL: + line = line + indent +'&arg'+str(g_m)+'_l' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + + if ninds>0: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('set_size == 0 || set_size == set->core_size || ncolors == 1') + code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + if ninds==0: #direct version only + if accs[g_m]==OP_INC or 
accs[g_m]==OP_WRITE: + code('h[0] = _l;') + elif accs[g_m]==OP_MIN: + code('h[0] = MIN(h[0],_l);') + elif accs[g_m]==OP_MAX: + code('h[0] = MAX(h[0],_l);') + else: + print('internal error: invalid reduction option') + if typs[g_m] == 'double': #need for both direct and indirect + code('op_mpi_reduce_double(&,h);') + elif typs[g_m] == 'float': + code('op_mpi_reduce_float(&,h);') + elif typs[g_m] == 'int': + code('op_mpi_reduce_int(&,h);') + else: + print('Type '+typs[g_m]+' not supported in OpenACC code generator, please add it') + exit(-1) + + + code('op_mpi_set_dirtybit_cuda(nargs, args);') + code('') + +# +# update kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + ENDIF() + depth -= 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('openacc'): + os.makedirs('openacc') + fid = open('openacc/'+name+'_acckernel.c','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if 
consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_c.h" ') + code('') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + code('}') + code('') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_acckernel.c"') + master = master.split('.')[0] + fid = open('openacc/'+master.split('.')[0]+'_acckernels.c','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + + + + diff --git a/translator/c/op2_gen_openmp.py b/translator/c/op2_gen_openmp.py new file mode 100644 index 000000000..01510f007 --- /dev/null +++ b/translator/c/op2_gen_openmp.py @@ -0,0 +1,787 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import op2_gen_common + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + prefix = ' '*depth + file_text += prefix+rep(text,g_m)+'\n' + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + + +def op2_gen_openmp(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, 
decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ and accs[i] != OP_WRITE: + j = i + reduct = j >= 0 + print(name, reduct) +########################################################################## +# start with OpenMP kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + comm('user function') + + if FORTRAN: + code('include '+name+'.inc') + elif CPP: + code('#include "../'+decl_filepath+'"') + + comm('') + comm(' x86 kernel function') + + if FORTRAN: + code('subroutine op_x86_'+name+'(') + elif CPP: + code('void op_x86_'+name+'(') + + depth = 2 + + if ninds>0: + if FORTRAN: + code('integer(4) blockIdx,') + elif CPP: + code('int blockIdx,') + + for g_m in range(0,ninds): + if FORTRAN: + code(' *ind_,') + elif CPP: + code(' *ind_,') + + if ninds>0: + if FORTRAN: + code('int *ind_map,') + code('short *arg_map,') + elif CPP: + code('int *ind_map,') + code('short *arg_map,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL and accs[g_m] == OP_READ: + # declared const for performance + if FORTRAN: + code('const *,') + elif CPP: + code('const *,') + elif 
maps[g_m]==OP_ID and ninds>0: + if FORTRAN: + code(',') + elif CPP: + code(' *,') + elif maps[g_m]==OP_GBL or maps[g_m]==OP_ID: + if FORTRAN: + code(',') + elif CPP: + code(' *,') + + if ninds>0: + if FORTRAN: + code('ind_arg_sizes,') + code('ind_arg_offs, ') + code('block_offset, ') + code('blkmap, ') + code('offset, ') + code('nelems, ') + code('ncolors, ') + code('colors, ') + code('set_size) { ') + code('') + elif CPP: + code('int *ind_arg_sizes,') + code('int *ind_arg_offs, ') + code('int block_offset, ') + code('int *blkmap, ') + code('int *offset, ') + code('int *nelems, ') + code('int *ncolors, ') + code('int *colors, ') + code('int set_size) { ') + code('') + else: + if FORTRAN: + code('start, finish )') + elif CPP: + code('int start, int finish ) {') + code('') + + for g_m in range (0,nargs): + if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + code(' _l[];') + + for m in range (1,ninds+1): + g_m = m-1 + v = [int(inds[i]==m) for i in range(len(inds))] + v_i = [vectorised[i] for i in range(len(inds)) if inds[i] == m] + if sum(v)>1 and sum(v_i)>0: + if indaccs[m-1] == OP_INC: + ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 + code(' *_vec['+str(ind)+'] = {'); depth += 2; + for n in range(0,nargs): + if inds[n] == m: + g_m = n + code('_l,') + depth -= 2 + code('};') + else: + ind = int(max([idxs[i] for i in range(len(inds)) if inds[i]==m])) + 1 + if indaccs[m-1] == OP_READ: + code('const *_vec['+str(ind)+'];') + else: + code(' *_vec['+str(ind)+'];') +# +# lengthy code for general case with indirection +# + if ninds>0: + code('') + for g_m in range (0,ninds): + code('int *ind__map, ind__size;') + for g_m in range (0,ninds): + code(' *ind__s;') + + if FORTRAN: + code('integer(4) :: nelem, offset_b, blockId') + code('character :: shared[64000]') + elif CPP: + code('int nelem, offset_b;') + code('') + code('char shared[128000];') + + code('') + IF('0==0') + code('') + comm(' get sizes and shift pointers and direct-mapped data') + code('') + 
code('int blockId = blkmap[blockIdx + block_offset];') + code('nelem = nelems[blockId];') + code('offset_b = offset[blockId];') + code('') + + for g_m in range (0,ninds): + code('ind__size = ind_arg_sizes['+str(g_m)+'+blockId*'+ str(ninds)+'];') + code('') + for m in range (1,ninds+1): + g_m = m-1 + c = [i for i in range(len(inds)) if inds[i]==m] + code('ind__map = &ind_map['+str(cumulative_indirect_index[c[0]])+\ + '*set_size] + ind_arg_offs['+str(m-1)+'+blockId*'+str(ninds)+'];') + + code('') + comm(' set shared memory pointers') + code('int nbytes = 0;') + + for g_m in range(0,ninds): + code('ind__s = ( *) &shared[nbytes];') + if g_m < ninds-1: + code('nbytes += ROUND_UP(ind__size*sizeof()*);') + ENDIF() + code('') + comm(' copy indirect datasets into shared memory or zero increment') + code('') + + for g_m in range(0,ninds): + if indaccs[g_m]==OP_READ or indaccs[g_m]==OP_RW or indaccs[g_m]==OP_INC: + FOR('n','0','_size') + FOR('d','0','') + if indaccs[g_m]==OP_READ or indaccs[g_m]==OP_RW: + code('_s[d+n*] = [d+_map[n]*];') + code('') + elif indaccs[g_m]==OP_INC: + code('_s[d+n*] = ZERO_;') + ENDFOR() + ENDFOR() + + code('') + comm(' process set elements') + code('') + + if ind_inc: + FOR('n','0','nelem') + comm(' initialise local variables ') + for g_m in range(0,nargs): + if maps[g_m]==OP_MAP and accs[g_m]==OP_INC: + FOR('d','0','') + code('_l[d] = ZERO_;') + ENDFOR() + else: + FOR('n','0','nelem') + +# +# simple alternative when no indirection +# + else: + comm(' process set elements') + FOR('n','start','finish') + +# +# kernel call +# + # xxx: array of pointers for non-locals + for m in range(1,ninds+1): + s = [i for i in range(len(inds)) if inds[i]==m] + if sum(s)>1: + if indaccs[m-1] != OP_INC: + code('') + ctr = 0 + for n in range(0,nargs): + if inds[n] == m and vectorised[n]: + code('arg'+str(m-1)+'_vec['+str(ctr)+'] = ind_arg'+\ + str(inds[n]-1)+'_s+arg_map['+str(cumulative_indirect_index[n])+\ + '*set_size+n+offset_b]*'+str(dims[n])+';') + ctr = ctr+1 
+ + code('') + comm(' user-supplied kernel call') + + line = name+'(' + prefix = ' '*len(name) + a = 0 #only apply indentation if its not the 0th argument + indent ='' + for m in range (0, nargs): + if a > 0: + indent = ' '+' '*len(name) + + if maps[m] == OP_GBL: + line += rep(indent+',\n',m) + a = a+1 + elif maps[m]==OP_MAP and accs[m]==OP_INC and vectorised[m]==0: + line += rep(indent+'_l,\n',m); + a = a+1 + elif maps[m]==OP_MAP and vectorised[m]==0: + line += rep(indent+'ind_arg'+str(inds[m]-1)+'_s+arg_map['+\ + str(cumulative_indirect_index[m])+'*set_size+n+offset_b]*,\n',m) + a = a+1 + elif maps[m]==OP_MAP and m == 0: + line += rep(indent+'_vec,'+'\n',inds[m]-1) + a = a+1 + elif maps[m]==OP_MAP and m>0 and vectorised[m] != vectorised[m-1]: #xxx:vector + line += rep(indent+'_vec,'+'\n',inds[m]-1) + a = a+1 + elif maps[m]==OP_MAP and m>0 and vectorised[m] == vectorised[m-1]: + line = line + a = a+1 + elif maps[m]==OP_ID: + if ninds>0: + line += rep(indent+'+(n+offset_b)*,'+'\n',m) + a = a+1 + else: + line += rep(indent+'+n*,'+'\n',m) + a = a+1 + else: + print('internal error 1 ') + + code(line[0:-2]+');') #remove final ',' and \n + +# +# updating for indirect kernels ... 
+# + if ninds>0: + if ind_inc: + code('') + comm(' store local variables ') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + code('int _map = arg_map['+ str(cumulative_indirect_index[g_m])+\ + '*set_size+n+offset_b];') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + FOR('d','0','') + code('ind_arg'+str(inds[g_m]-1)+'_s[d+_map*] += _l[d];') + ENDFOR() + + ENDFOR() + + s = [i for i in range(1,ninds+1) if indaccs[i-1]!= OP_READ] + + if len(s)>0 and max(s)>0: + code('') + comm(' apply pointered write/increment') + + for g_m in range(0,ninds): + if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW or indaccs[g_m]==OP_INC: + FOR('n','0','_size') + FOR('d','0','') + if indaccs[g_m]==OP_WRITE or indaccs[g_m]==OP_RW: + code('[d+_map[n]*] = _s[d+n*];') + elif indaccs[g_m]==OP_INC: + code('[d+_map[n]*] += _s[d+n*];') + ENDFOR() + ENDFOR() +# +# ... and direct kernels +# + else: + depth -= 2 + code('}') + +# +# global reduction +# + depth -= 2 + code('}') + code('') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function ') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL and accs[g_m] != OP_READ: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in 
range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# indirect bits +# + if ninds>0: + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + + code('') + comm(' get plan') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + +# +# set number of threads in x86 execution and create arrays for reduction +# + + if reduct or ninds==0: + comm(' set number of threads') + code('#ifdef _OPENMP') + code(' int nthreads = omp_get_max_threads();') + code('#else') + code(' int nthreads = 1;') + code('#endif') + + if reduct: + code('') + comm(' allocate and initialise arrays for global reduction') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE: + code(' _l[+64*64];') + FOR('thr','0','nthreads') + if accs[g_m]==OP_INC: + FOR('d','0','') + code('_l[d+thr*64]=ZERO_;') + ENDFOR() + else: + 
FOR('d','0','') + code('_l[d+thr*64]=h[d];') + ENDFOR() + ENDFOR() + + code('') + IF('set_size >0') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + code('op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);') + code('') + comm(' execute plan') + code('int block_offset = 0;') + FOR('col','0','Plan->ncolors') + IF('col==Plan->ncolors_core') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + code('int nblocks = Plan->ncolblk[col];') + code('') + code('#pragma omp parallel for') + FOR('blockIdx','0','nblocks') + code('op_x86_'+name+'( blockIdx,') + + for m in range(1,ninds+1): + g_m = invinds[m-1] + code('( *).data,') + + code('Plan->ind_map,') + code('Plan->loc_map,') + + for m in range(0,nargs): + g_m = m + if inds[m]==0 and maps[m] == OP_GBL and accs[m] != OP_READ and accs[m] != OP_WRITE: + code('&_l[64*omp_get_thread_num()],') + elif inds[m]==0: + code('( *).data,') + + code('Plan->ind_sizes,') + code('Plan->ind_offs,') + code('block_offset,') + code('Plan->blkmap,') + code('Plan->offset,') + code('Plan->nelems,') + code('Plan->nthrcol,') + code('Plan->thrcol,') + code('set_size);') + ENDFOR() + code('') + + if reduct: + comm(' combine reduction data') + IF('col == Plan->ncolors_owned-1') + for m in range(0,nargs): + if maps[m] == OP_GBL and accs[m] != OP_READ and accs[m] != OP_WRITE: + FOR('thr','0','nthreads') + if accs[m]==OP_INC: + FOR('d','0','') + code('h[d] += _l[d+thr*64];') + ENDFOR() + elif accs[m]==OP_MIN: + FOR('d','0','') + code('h[d] = MIN(h[d],_l[d+thr*64]);') + ENDFOR() + elif accs(m)==OP_MAX: + FOR('d','0','') + code('h[d] = MAX(h[d],_l[d+thr*64]);') + ENDFOR() + else: + error('internal error: invalid reduction option') + ENDFOR() + ENDIF() + code('block_offset += nblocks;'); + ENDIF() + +# +# kernel call for direct version +# + else: + comm(' execute plan') + code('#pragma omp parallel for') + FOR('thr','0','nthreads') + code('int start = (set->size* thr)/nthreads;') + code('int 
finish = (set->size*(thr+1))/nthreads;') + code('op_x86_'+name+'(') + + for g_m in range(0,nargs): + indent = '' + if maps[g_m]==OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + code(indent+'_l + thr*64,') + else: + code(indent+'( *) .data,') + + code('start, finish );') + ENDFOR() + + if ninds>0: + code('op_timing_realloc('+str(nk)+');') + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer; ') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + ENDIF() + code('') + +# +# combine reduction data from multiple OpenMP threads, direct version +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE and ninds==0: + FOR('thr','0','nthreads') + if accs[g_m]==OP_INC: + FOR('d','0','') + code('h[d] += _l[d+thr*64];') + ENDFOR() + elif accs[g_m]==OP_MIN: + FOR('d','0','') + code('h[d] = MIN(h[d],_l[d+thr*64]);') + ENDFOR() + elif accs[g_m]==OP_MAX: + FOR('d','0','') + code('h[d] = MAX(h[d],_l[d+thr*64]);') + ENDFOR() + else: + print('internal error: invalid reduction option') + ENDFOR() + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + code('op_mpi_reduce(&,h);') + + code('op_mpi_set_dirtybit(nargs, args);') + code('') + +# +# update kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('op_timing_realloc('+str(nk)+');') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + ENDIF() + depth -= 2 + code('}') + + +########################################################################## +# output individual kernel file 
+########################################################################## + if not os.path.exists('openmp'): + os.makedirs('openmp') + fid = open('openmp/'+name+'_kernel.cpp','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'] > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h" ') + code('') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_kernel.cpp"') + master = master.split('.')[0] + fid = open('openmp/'+master.split('.')[0]+'_kernels.cpp','w') + fid.write('//\n// auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n//\n\n') + fid.write(file_text) + fid.close() + + + diff --git a/translator/c/op2_gen_openmp4.py b/translator/c/op2_gen_openmp4.py new file mode 100644 index 000000000..ce9e4f17a --- /dev/null +++ b/translator/c/op2_gen_openmp4.py @@ -0,0 +1,898 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# 
+########################################################################## + +import re +import glob +import datetime +import op2_gen_common +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + +def op2_gen_openmp4(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + op2_compiler = 
os.getenv('OP2_COMPILER','0'); + any_soa = 0 + maptype = 'map' + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + + name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + +########################################################################## +# start with the user kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + comm('user function') + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT;') + code('int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST=-1;') + dir_soa = -1 + for g_m in range(0,nargs): + if 
maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('int direct_'+name+'_stride_OP2CONSTANT;') + code('int direct_'+name+'_stride_OP2HOST=-1;') + dir_soa = g_m + break + + comm('user function') + file_name = decl_filepath + + f = open(file_name, 'r') + kernel_text = f.read() + f.close() + + kernel_text = op2_gen_common.comment_remover(kernel_text) + kernel_text = op2_gen_common.remove_trailing_w_space(kernel_text) + + p = re.compile('void\\s+\\b'+name+'\\b') + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + + #i = kernel_text[0:i].rfind('\n') #reverse find + j = kernel_text[i:].find('{') + k = op2_gen_common.para_parse(kernel_text, i+j, '{', '}') + signature_text = kernel_text[i:i+j] + l = signature_text[0:].find('(') + head_text = signature_text[0:l] #save function name + m = op2_gen_common.para_parse(signature_text, 0, '(', ')') + signature_text = signature_text[l+1:m] + body_text = kernel_text[i+j+1:k] + + ## Replace occurrences of '#include ""' within loop with the contents of : + body_text = op2_gen_common.replace_local_includes_with_file_contents(body_text, os.path.dirname(master)) + + # check for number of arguments + if len(signature_text.split(',')) != nargs_novec: + print('Error parsing user kernel(%s): must have %d arguments' \ + % name, nargs) + return + + for i in range(0,nargs_novec): + var = signature_text.split(',')[i].strip() + if kernels[nk]['soaflags'][i]: + var = var.replace('*','') + #locate var in body and replace by adding [idx] + length = len(re.compile('\\s+\\b').split(var)) + var2 = re.compile('\\s+\\b').split(var)[length-1].strip() + + if int(kernels[nk]['idxs'][i]) < 0 and kernels[nk]['maps'][i] == OP_MAP: + body_text = re.sub(r'\b'+var2+'(\[[^\]]\])\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'\1[(\2)*'+ \ + 
op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + else: + body_text = re.sub('\*\\b'+var2+'\\b\\s*(?!\[)', var2+'[0]', body_text) + body_text = re.sub(r'\b'+var2+'\[([\\s\+\*A-Za-z0-9]*)\]'+'', var2+r'[(\1)*'+ \ + op2_gen_common.get_stride_string(unique_args[i]-1,maps,mapnames,name)+']', body_text) + + for nc in range(0,len(consts)): + varname = consts[nc]['name'] + body_text = re.sub('\\b'+varname+'\\b', varname+'_ompkernel',body_text) +# if consts[nc]['dim'] == 1: +# body_text = re.sub(varname+'(?!\w)', varname+'_ompkernel', body_text) +# else: +# body_text = re.sub('\*'+varname+'(?!\[)', varname+'[0]', body_text) +# body_text = re.sub(r''+varname+'\[([A-Za-z0-9]*)\]'+'', varname+r'_ompkernel[\1]', body_text) + + vec = 0 + for n in range(0,nargs): + if (vectorised[n] == 1): + vec = 1 + kernel_params = [ var.strip() for var in signature_text.split(',')] + if vec: + new_kernel_params = [] + for m in range(0,nargs_novec): + if int(kernels[nk]['idxs'][m])<0 and int(kernels[nk]['maps'][m]) == OP_MAP: + new_kernel_params = new_kernel_params + [kernel_params[m]]*int(-1*int(kernels[nk]['idxs'][m])) + else: + new_kernel_params = new_kernel_params + [kernel_params[m]] + kernel_params = new_kernel_params + + # collect constants used by kernel + kernel_consts = [] + for nc in range(0,len(consts)): + if body_text.find(consts[nc]['name']+'_ompkernel') != -1: + kernel_consts.append(nc) + +############################################################ +# omp4 function call definition +############################################################ + code('') + func_call_signaure_text = 'void ' + name + '_omp4_kernel(' + params = '' + indent = '\n' + ' ' + k = [] + for g_m in range(0, nargs): + if maps[g_m] == OP_GBL: + params += indent + rep(' *,',g_m) + if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): + k = k + [invmapinds[inds[g_m]-1]] + params += indent + 'int *map'+str(mapinds[g_m])+',' + if maptype == 'map': + params += indent 
+ 'int map'+str(mapinds[g_m])+'size,' + if maps[g_m] == OP_ID: + params += indent + rep(' *data'+str(g_m)+',', g_m) + if maptype == 'map': + params += indent + 'int dat'+str(g_m)+'size,' + for m in range(1,ninds+1): + g_m = invinds[m-1] + params += indent + rep(' *data'+str(g_m)+',', g_m) + if maptype == 'map': + params += indent + 'int dat'+str(g_m)+'size,' + if ninds>0: + # add indirect kernel specific params to kernel func call + params += indent + 'int *col_reord,' + indent + 'int set_size1,' + indent + 'int start,' + indent + 'int end,' + else: + # add direct kernel specific params to kernel func call + params += indent + 'int count,' + params += indent + 'int num_teams,' + indent + 'int nthread' + #add strides for SoA to params + if any_soa: + indent = ','+indent + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + params += indent + 'int opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT' + + if dir_soa!=-1: + params += indent + 'int direct_'+name+'_stride_OP2CONSTANT' + if nopts>0: + params += ', int optflags' + code(func_call_signaure_text+params+');') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL: #and accs[g_m] <> OP_READ: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + 
code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + + if nopts>0: + code('int optflags = 0;') + for i in range(0,nargs): + if optflags[i] == 1: + IF('args['+str(i)+'].opt') + code('optflags |= 1<<'+str(optidxs[i])+';') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('') + +# +# indirect bits +# + if ninds>0: + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + code('') + comm(' get plan') + code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + code('int set_size = op_mpi_halo_exchanges_cuda(set, nargs, args);') + +# +# get part and block size +# + code('') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('#ifdef 
OP_BLOCK_SIZE_'+ str(nk)) + code(' int nthread = OP_BLOCK_SIZE_'+str(nk)+';') + code('#else') + code(' int nthread = OP_block_size;') + code('#endif') + + code('') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL: #and accs[g_m]<>OP_READ: + if not dims[g_m].isdigit() or int(dims[g_m]) > 1: + print('ERROR: OpenMP 4 does not support multi-dimensional variables') + exit(-1) + code(' _l = h[0];') + + if ninds > 0: + code('') + code('int ncolors = 0;') + code('int set_size1 = set->size + set->exec_size;') + code('') + IF('set_size >0') + #managing constants + if any_soa: + code('') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(OP_kernels[' +str(nk)+ '].count==1) || (opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(g_m)+'))') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(g_m)+');') + code('opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1])+'_'+name+'_stride_OP2HOST;') + ENDIF() + if dir_soa!=-1: + IF('(OP_kernels[' +str(nk)+ '].count==1) || (direct_'+name+'_stride_OP2HOST != getSetSizeFromOpArg(&arg'+str(dir_soa)+'))') + code('direct_'+name+'_stride_OP2HOST = getSetSizeFromOpArg(&arg'+str(dir_soa)+');') + code('direct_'+name+'_stride_OP2CONSTANT = direct_'+name+'_stride_OP2HOST;') + ENDIF() + code('') + comm('Set up typed device pointers for OpenMP') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): + k = k + [invmapinds[inds[g_m]-1]] + code('int *map'+str(mapinds[g_m])+' = arg'+str(invmapinds[inds[g_m]-1])+'.map_data_d;') + if maptype == 'map': + code(' int map'+str(mapinds[g_m])+'size = arg'+str(invmapinds[inds[g_m]-1])+'.map->dim * set_size1;') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+'* data'+str(g_m)+' = 
('+typs[g_m]+'*)arg'+str(g_m)+'.data_d;') + if maptype == 'map': + if optflags[g_m]: + code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') + else: + code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') + + for m in range(1,ninds+1): + g_m = invinds[m-1] + code(' *data'+str(g_m)+' = ( *).data_d;') + if maptype == 'map': + if optflags[g_m]: + code('int dat'+str(g_m)+'size = (arg'+str(g_m)+'.opt?1:0) * getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') + else: + code('int dat'+str(g_m)+'size = getSetSizeFromOpArg(&arg'+str(g_m)+') * arg'+str(g_m)+'.dat->dim;') + +# +# prepare kernel params for indirect version +# + if ninds>0: + code('') + code('op_plan *Plan = op_plan_get_stage(name,set,part_size,nargs,args,ninds,inds,OP_COLOR2);') + code('ncolors = Plan->ncolors;') + code('int *col_reord = Plan->col_reord;') + code('') + comm(' execute plan') + FOR('col','0','Plan->ncolors') + IF('col==1') + code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + code('int start = Plan->col_offsets[0][col];') + code('int end = Plan->col_offsets[0][col+1];') + code('') +# +# kernel function call +# + indent = '\n' + ' ' * (depth+2) + call_params = ','.join([ indent + re.sub(r'\*arg(\d+)',r'&arg\1_l',param.strip().split(' ')[-1]) for param in params.split(',')]) + call_params = call_params.replace('*','') + # set params for indirect version + if ninds>0: + call_params = call_params.replace('num_teams','part_size!=0?(end-start-1)/part_size+1:(end-start-1)/nthread') + # set params for direct version + else: + call_params = re.sub('count','set->size',call_params); + call_params = call_params.replace('num_teams','part_size!=0?(set->size-1)/part_size+1:(set->size-1)/nthread') + code(func_call_signaure_text.split(' ')[-1]+call_params+');') + code('') + + if ninds>0: + if reduct: + comm(' combine reduction data') + IF('col == Plan->ncolors_owned-1') + for 
g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: + code('h[0] = _l;') + elif accs[g_m]==OP_MIN: + code('h[0] = MIN(h[0],_l);') + elif accs[g_m]==OP_MAX: + code('h[0] = MAX(h[0],_l);') + else: + error('internal error: invalid reduction option') + ENDIF() + ENDFOR() + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('set_size == 0 || set_size == set->core_size || ncolors == 1') + code('op_mpi_wait_all_cuda(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + if ninds==0: #direct version only + if accs[g_m]==OP_INC or accs[g_m]==OP_WRITE: + code('h[0] = _l;') + elif accs[g_m]==OP_MIN: + code('h[0] = MIN(h[0],_l);') + elif accs[g_m]==OP_MAX: + code('h[0] = MAX(h[0],_l);') + else: + print('internal error: invalid reduction option') + if typs[g_m] == 'double': #need for both direct and indirect + code('op_mpi_reduce_double(&,h);') + elif typs[g_m] == 'float': + code('op_mpi_reduce_float(&,h);') + elif typs[g_m] == 'int': + code('op_mpi_reduce_int(&,h);') + else: + print('Type '+typs[g_m]+' not supported in OpenMP4 code generator, please add it') + exit(-1) + + + code('op_mpi_set_dirtybit_cuda(nargs, args);') + code('') + +# +# update kernel record +# + + code('if (OP_diags>1) deviceSync();') + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + 
ENDIF() + + depth -= 2 + code('}') + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('openmp4'): + os.makedirs('openmp4') + fid = open('openmp4/'+name+'_omp4kernel.cpp','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +############################################################## +# generate ****_omp4kernel_func.cpp +############################################################## + file_text = '' + + if CPP: + includes = op2_gen_common.extract_includes(kernel_text) + if len(includes) > 0: + for include in includes: + code(include) + code("") + + code(func_call_signaure_text+params+'){') + code('') + depth += 2 + for g_m in range(0, nargs): + if maps[g_m] == OP_GBL: + code(' _l = *;') + line = '#pragma omp target teams' + if op2_compiler == 'clang': + line +=' distribute parallel for schedule(static,1)\\\n' + (depth+2)*' ' + line +=' num_teams(num_teams) thread_limit(nthread) ' + map_clause = '' + if maptype == 'map': + map_clause = 'map(to:' + elif maptype == 'is_device_ptr': + map_clause = 'is_device_ptr(' + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if maptype == 'map': + map_clause += 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' + else: + map_clause += 'data'+str(g_m)+',' + if map_clause != 'is_device_ptr(' and map_clause != 'map(to:': + map_clause = map_clause[:-1]+')' + line += map_clause + # mapping global consts + if len(kernel_consts) != 0: + line += ' \\\n' + (depth+2)*' ' + 'map(to:' + for nc in kernel_consts: + line += ' ' + consts[nc]['name']+'_ompkernel,' + if consts[nc]['dim'] != 1: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + line = line[:-1] + '[:'+ num +'],' + line = line[:-1]+')' + # prepare reduction + 
reduction_string = '' + reduction_mapping = '' + if reduct: + reduction_mapping ='\\\n'+(depth+2)*' '+ 'map(tofrom:' + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + if accs[g_m] == OP_INC: + reduction_string += ' reduction(+:arg%d_l)' % g_m + reduction_mapping += ' arg%d_l,' % g_m + if accs[g_m] == OP_MIN: + reduction_string += ' reduction(min:arg%d_l)' % g_m + reduction_mapping += ' arg%d_l,' % g_m + if accs[g_m] == OP_MAX: + reduction_string += ' reduction(max:arg%d_l)' % g_m + reduction_mapping += ' arg%d_l,' % g_m + if accs[g_m] == OP_WRITE: + reduction_mapping += ' arg%d_l,' % g_m + reduction_mapping = reduction_mapping[0:-1]+')' +# +# map extra pointers for indirect version +# + if ninds>0: + if maptype == 'map': + line += '\\\n'+(depth+2)*' '+'map(to:col_reord[0:set_size1],' + else: + line += '\\\n'+(depth+2)*' '+'map(to:col_reord,' + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not invmapinds[inds[g_m]-1] in k): + k = k + [invmapinds[inds[g_m]-1]] + if maptype == 'map': + line = line + 'map'+str(mapinds[g_m])+'[0:map'+str(mapinds[g_m])+'size],' + else: + line = line + 'map'+str(mapinds[g_m])+',' + for m in range(1,ninds+1): + g_m = invinds[m-1] + if maptype == 'map': + line = line + 'data'+str(g_m)+'[0:dat'+str(g_m)+'size],' + else: + line = line + 'data'+str(g_m)+',' + line = line[:-1]+')' +# +# write omp pragma +# + code(line + reduction_mapping + reduction_string) + if op2_compiler != 'clang': + line = '#pragma omp distribute parallel for schedule(static,1)' + code(line + reduction_string) +# +# start for loop indirect version +# + if ninds>0: + FOR('e','start','end') + code('int n_op = col_reord[e];') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not 
mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ + '[n_op + set_size1 * '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + IF('optflags & 1<<'+str(optidxs[g_m])) + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = map'+str(invmapinds[inds[g_m]-1])+\ + '[n_op + set_size1 * '+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + if soaflags[g_m]: + line = line + indent + ' &data'+str(first)+'[map'+str(mapinds[g_m+k])+'idx],\n' + else: + line = line + indent + ' &data'+str(first)+'[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) +# +# direct version +# + else: + FOR('n_op','0','count') +# +# write inlined kernel function +# + comm('variable mapping') + for g_m in range(0,nargs): + line = kernel_params[g_m] + ' = ' + if maps[g_m] == OP_ID: + if soaflags[g_m]: + line += '&data%d[n_op]' % g_m + else: + line += '&data'+str(g_m)+'['+str(dims[g_m])+'*n_op]' + if maps[g_m] == OP_MAP: + if vectorised[g_m]: + if g_m+1 in unique_args: + line += 'arg'+str(g_m)+'_vec' + else: + line = '' + else: + if soaflags[g_m]: + line += '&data'+str(invinds[inds[g_m]-1])+'[map'+str(mapinds[g_m])+'idx]' + else: + line += '&data'+str(invinds[inds[g_m]-1])+'['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + line += '&arg%d_l' % g_m + if len(line): + line += ';' + 
code(line) + + code('') + comm('inline function') + indent = ' ' * (depth-2) + inline_body_text = '' + for line in body_text.split('\n'): + if len(line): + inline_body_text += indent+line+'\n' + else: + inline_body_text += '\n' + code(inline_body_text) + comm('end inline func') + + ENDFOR() + code('') + # end kernel function + for g_m in range(0, nargs): + if maps[g_m] == OP_GBL: + code('* = _l;') + depth -= 2; + code('}') + + +########################################################################## +# output individual omp4kernel file +########################################################################## + fid = open('openmp4/'+name+'_omp4kernel_func.cpp','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + comm(' header ') + code('#include "op_lib_cpp.h" ') + code('') + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_omp4kernel.cpp"') + master = master.split('.')[0] + fid = open('openmp4/'+master.split('.')[0]+'_omp4kernels.cpp','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + + +########################################################################## +# output omp4 master kernel file +########################################################################## + + file_text ='' + + comm(' global constants ') + + for nc in range (0,len(consts)): + if consts[nc]['dim']==1: + code(consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'_ompkernel;') + else: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + code(consts[nc]['type'][1:-1]+' 
'+consts[nc]['name']+'_ompkernel['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h" ') + code('') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + varname = consts[nc]['name'] + line = 2*' ' + 'memcpy(' + if consts[nc]['dim']==1: + line += '&' + line += varname+ '_ompkernel, dat, dim*sizeof('+consts[nc]['type'][1:-1]+'));\n' + indent + '#pragma omp target enter data map(to:'+varname+'_ompkernel' + if consts[nc]['dim'] !=1: + line += '[:%s]' % str(consts[nc]['dim']) if (consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0) else 'MAX_CONST_SIZE' + line += ')\n'+indent + code(line) + code('}') + code('') + + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_omp4kernel_func.cpp"') + master = master.split('.')[0] + fid = open('openmp4/'+master.split('.')[0]+'_omp4kernel_funcs.cpp','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + + diff --git a/translator/c/op2_gen_openmp_simple.py b/translator/c/op2_gen_openmp_simple.py new file mode 100644 index 000000000..98fac483b --- /dev/null +++ b/translator/c/op2_gen_openmp_simple.py @@ -0,0 +1,630 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import op2_gen_common + +insert_thread_timers = os.getenv('OP_TIME_THREADS', False); + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif 
FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line.rstrip()+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + if m < len(inddims): + line = re.sub('',str(inddims[m]),line) + line = re.sub('',str(indtyps[m]),line) + + line = re.sub('','ind_arg'+str(m),line) + line = re.sub('',str(dims[m]),line) + line = re.sub('','arg'+str(m),line) + line = re.sub('',typs[m],line) + line = re.sub('',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if text == '': + prefix = '' + else: + prefix = ' '*depth + file_text += prefix+rep(text,g_m).rstrip()+'\n' + + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('do '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('enddo') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('if ('+line+') then') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('endif') + elif CPP: + code('}') + + +def op2_gen_openmp_simple(master, date, consts, kernels): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): 
+ name, nargs, dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ and accs[i] != OP_WRITE: + j = i + reduct = j >= 0 + +########################################################################## +# start with the user kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + comm('user function') + + if FORTRAN: + code('include '+name+'.inc') + elif CPP: + code('#include "../'+decl_filepath+'"') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + for g_m in range (0,nargs): + if maps[g_m]==OP_GBL and accs[g_m] != OP_READ: + code('*h = ( *).data;') + + code('int nargs = '+str(nargs)+';') + code('op_arg 
args['+str(nargs)+'];') + code('') + + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + if insert_thread_timers: + code("op_timing_realloc_manytime({0}, {1});".format(str(nk), "omp_get_max_threads()")) + else: + code('op_timing_realloc('+str(nk)+');') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('op_timers_core(&cpu_t1, &wall_t1);') + if insert_thread_timers: + code('double non_thread_walltime = 0.0;') + code('') + +# +# indirect bits +# + if ninds>0: + code('int ninds = '+str(ninds)+';') + line = 'int inds['+str(nargs)+'] = {' + for m in range(0,nargs): + line += str(inds[m]-1)+',' + code(line[:-1]+'};') + code('') + + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + + code('') + comm(' get plan') + code('#ifdef OP_PART_SIZE_'+ str(nk)) + code(' int part_size = OP_PART_SIZE_'+str(nk)+';') + code('#else') + code(' int part_size = OP_part_size;') + code('#endif') + code('') + code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + code('') + 
code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + +# +# set number of threads in x86 execution and create arrays for reduction +# + + if reduct or ninds==0: + comm(' set number of threads') + code('#ifdef _OPENMP') + code(' int nthreads = omp_get_max_threads();') + code('#else') + code(' int nthreads = 1;') + code('#endif') + + if reduct: + code('') + comm(' allocate and initialise arrays for global reduction') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE: + code(' _l[nthreads*64];') + FOR('thr','0','nthreads') + if accs[g_m]==OP_INC: + FOR('d','0','') + code('_l[d+thr*64]=ZERO_;') + ENDFOR() + else: + FOR('d','0','') + code('_l[d+thr*64]=h[d];') + ENDFOR() + ENDFOR() + + code('') + IF('set_size >0') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + code('op_plan *Plan = op_plan_get_stage_upload(name,set,part_size,nargs,args,ninds,inds,OP_STAGE_ALL,0);') + code('') + comm(' execute plan') + code('int block_offset = 0;') + FOR('col','0','Plan->ncolors') + IF('col==Plan->ncolors_core') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + code('int nblocks = Plan->ncolblk[col];') + code('') + if insert_thread_timers: + # Pause process timing and switch to per-thread timing: + code('// Pause process timing and switch to per-thread timing:') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('non_thread_walltime += wall_t2 - wall_t1;') + + code('#pragma omp parallel') + code('{') + depth += 2 + code('double thr_wall_t1, thr_wall_t2, thr_cpu_t1, thr_cpu_t2;') + code('op_timers_core(&thr_cpu_t1, &thr_wall_t1);') + code('') + code('int nthreads = omp_get_num_threads();') + code('int thr = omp_get_thread_num();') + code('int thr_start = (nblocks * thr) / nthreads;') + code('int thr_end = (nblocks * (thr+1)) / nthreads;') + code('if (thr_end > nblocks) thr_end = nblocks;') + FOR('blockIdx','thr_start','thr_end') + else: + code('#pragma omp parallel for') + 
FOR('blockIdx','0','nblocks') + + code('int blockId = Plan->blkmap[blockIdx + block_offset];') + code('int nelem = Plan->nelems[blockId];') + code('int offset_b = Plan->offset[blockId];') + FOR('n','offset_b','offset_b+nelem') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + if vectorised[g_m]: + index = vectorised.index(vectorised[g_m]) + else: + index = g_m + IF('arg'+str(index)+'.opt') + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + line = line + indent + ' &((*)arg'+str(first)+'.data)[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) + code('') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + 
'&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + if vectorised[g_m]: + if g_m+1 in unique_args: + line = line + indent + 'arg'+str(g_m)+'_vec' + else: + line = line + indent + '&(('+typs[g_m]+'*)arg'+str(invinds[inds[g_m]-1])+'.data)['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]' + else: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + if g_m+1 in unique_args and not g_m+1 == unique_args[-1]: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + if insert_thread_timers: + depth -= 2 + code('}') + code('') + code('op_timers_core(&thr_cpu_t2, &thr_wall_t2);') + code('OP_kernels[' +str(nk)+ '].times[thr] += thr_wall_t2 - thr_wall_t1;') + ENDFOR() + code('') + + if reduct: + comm(' combine reduction data') + IF('col == Plan->ncolors_owned-1') + for m in range(0,nargs): + if maps[m] == OP_GBL and accs[m] != OP_READ: + FOR('thr','0','nthreads') + if accs[m]==OP_INC: + FOR('d','0','') + code('h[d] += _l[d+thr*64];') + ENDFOR() + elif accs[m]==OP_MIN: + FOR('d','0','') + code('h[d] = MIN(h[d],_l[d+thr*64]);') + ENDFOR() + elif accs[m]==OP_MAX: + FOR('d','0','') + code('h[d] = MAX(h[d],_l[d+thr*64]);') + ENDFOR() + else: + error('internal error: invalid reduction option') + ENDFOR() + ENDIF() + + if insert_thread_timers: + code('// Revert to process-level timing:') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + code('block_offset += nblocks;'); + ENDIF() + +# +# kernel call for direct version +# + else: + comm(' execute plan') + if insert_thread_timers: + # Pause process timing, and switch to per-thread timing: + code('// Pause process timing, and switch to per-thread timing:') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('non_thread_walltime += wall_t2 - wall_t1;') + code('#pragma omp parallel for') + 
FOR('thr','0','nthreads') + if insert_thread_timers: + code('double thr_wall_t1, thr_wall_t2, thr_cpu_t1, thr_cpu_t2;') + code('op_timers_core(&thr_cpu_t1, &thr_wall_t1);') + code('int start = (set->size* thr)/nthreads;') + code('int finish = (set->size*(thr+1))/nthreads;') + FOR('n','start','finish') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+'*n]' + if maps[g_m] == OP_GBL: + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + line = line + indent +'&arg'+str(g_m)+'_l[64*omp_get_thread_num()]' + else: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + if insert_thread_timers: + code('op_timers_core(&thr_cpu_t2, &thr_wall_t2);') + code('OP_kernels['+str(nk)+'].times[thr] += thr_wall_t2 - thr_wall_t1;') + ENDFOR() + if insert_thread_timers: + # OpenMP block complete, so switch back to process timing: + code('// OpenMP block complete, so switch back to process timing:') + code('op_timers_core(&cpu_t1, &wall_t1);') + + if ninds>0: + code('OP_kernels['+str(nk)+'].transfer += Plan->transfer;') + code('OP_kernels['+str(nk)+'].transfer2 += Plan->transfer2;') + + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('set_size == 0 || set_size == set->core_size') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads, direct version +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE and ninds==0: + FOR('thr','0','nthreads') + if accs[g_m]==OP_INC: + FOR('d','0','') + code('h[d] += _l[d+thr*64];') + ENDFOR() + elif accs[g_m]==OP_MIN: + FOR('d','0','') + code('h[d] = MIN(h[d],_l[d+thr*64]);') + ENDFOR() + elif accs[g_m]==OP_MAX: + FOR('d','0','') + code('h[d] = 
MAX(h[d],_l[d+thr*64]);') + ENDFOR() + else: + print('internal error: invalid reduction option') + ENDFOR() + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: + code('op_mpi_reduce(&,h);') + + code('op_mpi_set_dirtybit(nargs, args);') + code('') + +# +# update kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + if insert_thread_timers: + code('non_thread_walltime += wall_t2 - wall_t1;') + if insert_thread_timers: + code('OP_kernels[' +str(nk)+ '].times[0] += non_thread_walltime;') + else: + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + ENDIF() + + depth -= 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('openmp'): + os.makedirs('openmp') + fid = open('openmp/'+name+'_kernel.cpp','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + code('#ifdef _OPENMP') + code(' #include ') + code('#endif') + code('') + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' 
+ + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h" ') + code('') + + code('#ifndef SKIP_DECL_CONST') + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + code('}') + code('#endif') + code('') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_kernel.cpp"') + master = master.split('.')[0] + fid = open('openmp/'+master.split('.')[0]+'_kernels.cpp','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + + + + diff --git a/translator/c/op2_gen_seq.py b/translator/c/op2_gen_seq.py new file mode 100644 index 000000000..71e3965bc --- /dev/null +++ b/translator/c/op2_gen_seq.py @@ -0,0 +1,490 @@ +########################################################################## +# +# MPI Sequential code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.cpp for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import op2_gen_common + +def comm(line): + global file_text, FORTRAN, CPP + global depth + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
def rep(line, m):
    """Expand the per-argument placeholder tokens in *line* for argument *m*.

    The values are read from the module-level tables ``dims``, ``idxs``,
    ``typs``, ``indtyps`` and ``inddims`` that ``op2_gen_seq`` fills in for
    the kernel currently being generated.

    Tokens are replaced in this order:
      <INDDIM>, <INDTYP>  -- only when ``m`` indexes ``inddims``
      <INDARG>            -- 'ind_arg' + m
      <DIM>, <ARG>, <TYP>, <IDX>

    Tokens are matched literally (``str.replace``), so replacement text is
    inserted verbatim and is never interpreted as a regex template.
    """
    substitutions = []
    if m < len(inddims):
        substitutions.append(('<INDDIM>', str(inddims[m])))
        substitutions.append(('<INDTYP>', str(indtyps[m])))

    # Built eagerly on purpose: a bad table entry for argument m fails
    # here even if the template does not mention the corresponding token.
    substitutions.extend([
        ('<INDARG>', 'ind_arg' + str(m)),
        ('<DIM>', str(dims[m])),
        ('<ARG>', 'arg' + str(m)),
        ('<TYP>', typs[m]),
        ('<IDX>', str(int(idxs[m]))),
    ])

    for token, value in substitutions:
        line = line.replace(token, value)
    return line


def code(text):
    """Append one line of generated code to the global ``file_text``.

    The text is passed through ``rep`` for the current argument ``g_m``,
    right-stripped, and indented by ``depth`` spaces. An empty *text*
    produces a bare newline with no indentation.
    """
    global file_text
    prefix = '' if text == '' else ' ' * depth
    file_text += prefix + rep(text, g_m).rstrip() + '\n'


def FOR(i, start, finish):
    """Emit the opening of a counted loop and indent what follows.

    Emits a Fortran ``do`` when ``FORTRAN`` is set, otherwise a C++ ``for``
    when ``CPP`` is set. ``depth`` grows by 2 in every case.
    """
    global depth
    if FORTRAN:
        code('do ' + i + ' = ' + start + ', ' + finish + '-1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def ENDFOR():
    """Close the loop opened by ``FOR``: dedent by 2, then emit the terminator."""
    global depth
    depth -= 2
    if FORTRAN:
        code('enddo')
    elif CPP:
        code('}')


def IF(line):
    """Emit the opening of a conditional on *line* and indent what follows."""
    global depth
    if FORTRAN:
        code('if (' + line + ') then')
    elif CPP:
        code('if (' + line + ') {')
    depth += 2


def ENDIF():
    """Close the conditional opened by ``IF``: dedent by 2, then emit the terminator."""
    global depth
    depth -= 2
    if FORTRAN:
        code('endif')
    elif CPP:
        code('}')
dims, maps, var, typs, accs, idxs, inds, soaflags, optflags, decl_filepath, \ + ninds, inddims, indaccs, indtyps, invinds, mapnames, invmapinds, mapinds, nmaps, nargs_novec, \ + unique_args, vectorised, cumulative_indirect_index = op2_gen_common.create_kernel_info(kernels[nk]) + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] +# +# set two logicals +# + j = 0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j > 0 + + j = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j > 0 + +########################################################################## +# start with the user kernel function +########################################################################## + + FORTRAN = 0; + CPP = 1; + g_m = 0; + file_text = '' + depth = 0 + + comm('user function') + + if FORTRAN: + code('include '+name+'.inc') + elif CPP: + code('#include "../'+decl_filepath+'"') + +########################################################################## +# then C++ stub function +########################################################################## + + code('') + comm(' host stub function') + code('void op_par_loop_'+name+'(char const *name, op_set set,') + depth += 2 + + for m in unique_args: + g_m = m - 1 + if m == unique_args[len(unique_args)-1]: + code('op_arg ){'); + code('') + else: + code('op_arg ,') + + code('int nargs = '+str(nargs)+';') + code('op_arg args['+str(nargs)+'];') + code('') + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] 
> 0: + code('.idx = 0;') + code('args['+str(g_m)+'] = ;') + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + if (optflags[g_m] == 1): + argtyp = 'op_opt_arg_dat(arg'+str(first)+'.opt, ' + else: + argtyp = 'op_arg_dat(' + + FOR('v','1',str(sum(v))) + code('args['+str(g_m)+' + v] = '+argtyp+'arg'+str(first)+'.dat, v, arg'+\ + str(first)+'.map, , "", '+accsstring[accs[g_m]-1]+');') + ENDFOR() + code('') + elif vectorised[g_m]>0: + pass + else: + code('args['+str(g_m)+'] = ;') + +# +# start timing +# + code('') + comm(' initialise timers') + code('double cpu_t1, cpu_t2, wall_t1, wall_t2;') + code('op_timing_realloc('+str(nk)+');') + code('op_timers_core(&cpu_t1, &wall_t1);') + code('') + +# +# indirect bits +# + if ninds>0: + IF('OP_diags>2') + code('printf(" kernel routine with indirection: '+name+'\\n");') + ENDIF() + +# +# direct bit +# + else: + code('') + IF('OP_diags>2') + code('printf(" kernel routine w/o indirection: '+ name + '");') + ENDIF() + + code('') + if grouped: + code('int set_size = op_mpi_halo_exchanges_grouped(set, nargs, args, 1);') + else: + code('int set_size = op_mpi_halo_exchanges(set, nargs, args);') + + code('') + IF('set_size > 0') + code('') + +# +# kernel call for indirect version +# + if ninds>0: + FOR('n','0','set_size') + code('if (ncore_size && n>0 && n % OP_mpi_test_frequency == 0)') + code(' op_mpi_test_all(nargs,args);') + IF('n==set->core_size') + if grouped: + code('op_mpi_wait_all_grouped(nargs, args, 1);') + else: + code('op_mpi_wait_all(nargs, args);') + ENDIF() + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('int map'+str(mapinds[g_m])+'idx;') + #do non-optional ones + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k) and (not optflags[g_m]): + k = k + [mapinds[g_m]] + 
code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + #do optional ones + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + if optflags[g_m]: + if vectorised[g_m]: + index = vectorised.index(vectorised[g_m]) + else: + index = g_m + IF('arg'+str(index)+'.opt') + else: + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m])+'idx = arg'+str(invmapinds[inds[g_m]-1])+'.map_data[n * arg'+str(invmapinds[inds[g_m]-1])+'.map->dim + '+str(idxs[g_m])+'];') + if optflags[g_m]: + ENDIF() + + code('') + for g_m in range (0,nargs): + u = [i for i in range(0,len(unique_args)) if unique_args[i]-1 == g_m] + if len(u) > 0 and vectorised[g_m] > 0: + if accs[g_m] == OP_READ: + line = 'const * _vec[] = {\n' + else: + line = '* _vec[] = {\n' + + v = [int(vectorised[i] == vectorised[g_m]) for i in range(0,len(vectorised))] + first = [i for i in range(0,len(v)) if v[i] == 1] + first = first[0] + + indent = ' '*(depth+2) + for k in range(0,sum(v)): + line = line + indent + ' &((*)arg'+str(first)+'.data)[ * map'+str(mapinds[g_m+k])+'idx],\n' + line = line[:-2]+'};' + code(line) + code('') + + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+' * n]' + if maps[g_m] == OP_MAP: + if vectorised[g_m]: + if g_m+1 in unique_args: + line = line + indent + 'arg'+str(g_m)+'_vec' + else: + line = line + indent + '&(('+typs[g_m]+'*)arg'+str(invinds[inds[g_m]-1])+'.data)['+str(dims[g_m])+' * map'+str(mapinds[g_m])+'idx]' + if maps[g_m] == OP_GBL: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + if g_m+1 in unique_args and not g_m+1 == unique_args[-1]: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + +# +# kernel call for direct version +# + else: + 
FOR('n','0','set_size') + line = name+'(' + indent = '\n'+' '*(depth+2) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '&(('+typs[g_m]+'*)arg'+str(g_m)+'.data)['+str(dims[g_m])+'*n]' + if maps[g_m] == OP_GBL: + line = line + indent +'('+typs[g_m]+'*)arg'+str(g_m)+'.data' + if g_m < nargs-1: + line = line +',' + else: + line = line +');' + code(line) + ENDFOR() + + ENDIF() + code('') + + #zero set size issues + if ninds>0: + IF('set_size == 0 || set_size == set->core_size') + code('op_mpi_wait_all(nargs, args);') + ENDIF() + +# +# combine reduction data from multiple OpenMP threads +# + comm(' combine reduction data') + for g_m in range(0,nargs): + if maps[g_m]==OP_GBL and accs[g_m]!=OP_READ: +# code('op_mpi_reduce(&,('+typs[g_m]+'*).data);') + if typs[g_m] == 'double': #need for both direct and indirect + code('op_mpi_reduce_double(&,('+typs[g_m]+'*).data);') + elif typs[g_m] == 'float': + code('op_mpi_reduce_float(&,('+typs[g_m]+'*).data);') + elif typs[g_m] == 'int': + code('op_mpi_reduce_int(&,('+typs[g_m]+'*).data);') + else: + print('Type '+typs[g_m]+' not supported in OpenACC code generator, please add it') + exit(-1) + + code('op_mpi_set_dirtybit(nargs, args);') + code('') + +# +# update kernel record +# + + comm(' update kernel record') + code('op_timers_core(&cpu_t2, &wall_t2);') + code('OP_kernels[' +str(nk)+ '].name = name;') + code('OP_kernels[' +str(nk)+ '].count += 1;') + code('OP_kernels[' +str(nk)+ '].time += wall_t2 - wall_t1;') + + if ninds == 0: + line = 'OP_kernels['+str(nk)+'].transfer += (float)set->size *' + + for g_m in range (0,nargs): + if optflags[g_m]==1: + IF('.opt') + if maps[g_m]!=OP_GBL: + if accs[g_m]==OP_READ: + code(line+' .size;') + else: + code(line+' .size * 2.0f;') + if optflags[g_m]==1: + ENDIF() + else: + names = [] + for g_m in range(0,ninds): + mult='' + if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[invinds[g_m]] in names: + if optflags[g_m]==1: + 
IF('arg'+str(invinds[g_m])+'.opt') + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[g_m])+'.size'+mult+';') + if optflags[g_m]==1: + ENDIF() + names = names + [var[invinds[g_m]]] + for g_m in range(0,nargs): + mult='' + if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: + mult = ' * 2.0f' + if not var[g_m] in names: + names = names + [var[g_m]] + if optflags[g_m]==1: + IF('.opt') + if maps[g_m] == OP_ID: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + elif maps[g_m] == OP_GBL: + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(g_m)+'.size'+mult+';') + if optflags[g_m]==1: + ENDIF() + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('OP_kernels['+str(nk)+'].transfer += (float)set->size * arg'+str(invinds[inds[g_m]-1])+'.map->dim * 4.0f;') + + depth -= 2 + code('}') + + +########################################################################## +# output individual kernel file +########################################################################## + if not os.path.exists('seq'): + os.makedirs('seq') + fid = open('seq/'+name+'_seqkernel.cpp','w') + date = datetime.datetime.now() + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() + +# end of main kernel call loop + + +########################################################################## +# output one master kernel file +########################################################################## + + file_text ='' + + comm(' global constants ') + + for nc in range (0,len(consts)): + if not consts[nc]['user_declared']: + if consts[nc]['dim']==1: + code('extern '+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+';') + else: + if consts[nc]['dim'].isdigit() and int(consts[nc]['dim']) > 0: + num = str(consts[nc]['dim']) + else: + num = 'MAX_CONST_SIZE' + code('extern 
'+consts[nc]['type'][1:-1]+' '+consts[nc]['name']+'['+num+'];') + code('') + + comm(' header ') + + if os.path.exists('./user_types.h'): + code('#include "../user_types.h"') + code('#include "op_lib_cpp.h" ') + code('') + + for nc in range(0,len(consts)): + code('') + code('void op_decl_const_'+consts[nc]['name']+'(int dim, char const *type,') + code(' '+consts[nc]['type'][1:-1]+' *dat){') + code('}') + code('') + + comm(' user kernel files') + + for nk in range(0,len(kernels)): + code('#include "'+kernels[nk]['name']+'_seqkernel.cpp"') + master = master.split('.')[0] + fid = open('seq/'+master.split('.')[0]+'_seqkernels.cpp','w') + fid.write('//\n// auto-generated by op2.py\n//\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/c/op2_seq_gen.py b/translator/c/op2_seq_gen.py new file mode 100755 index 000000000..c7240edca --- /dev/null +++ b/translator/c/op2_seq_gen.py @@ -0,0 +1,202 @@ +#!/usr/bin/env python +####################################################################### +# # +# This Python routine generates the header file op_seq.h # +# # +####################################################################### + + +# +# this sets the max number of arguments in op_par_loop +# +maxargs = 20 + +#open/create file +f = open('./op_seq.h','w') + +# +#first the top bit +# + +top = """ +// +// header for sequential and MPI+sequentional execution +// + +#include "op_lib_cpp.h" + +static int op2_stride = 1; +#define OP2_STRIDE(arr, idx) arr[idx] + +// scratch space to use for double counting in indirect reduction +static int blank_args_size = 512; +static char* blank_args = (char *)op_malloc(blank_args_size); + +inline void op_arg_set(int n, op_arg arg, char **p_arg, int halo){ + *p_arg = arg.data; + + if (arg.argtype==OP_ARG_GBL) { + if (halo && (arg.acc != OP_READ)) *p_arg = blank_args; + } + else { + if (arg.map==NULL || arg.opt==0) // identity mapping + *p_arg += arg.size*n; + else // standard pointers + *p_arg += 
arg.size*arg.map->map[arg.idx+n*arg.map->dim]; + } +} + +inline void op_arg_copy_in(int n, op_arg arg, char **p_arg) { + for (int i = 0; i < -1*arg.idx; ++i) + p_arg[i] = arg.data + arg.map->map[i+n*arg.map->dim]*arg.size; +} + +inline void op_args_check(op_set set, int nargs, op_arg *args, + int *ninds, const char *name) { + for (int n=0; n\n') + if n%n_per_line == 3 and n != nargs-1: + f.write('\n ') + + f.write('void op_par_loop(void (*kernel)(') + for n in range (0, nargs): + f.write('T'+str(n)+'*') + if nargs != 1 and n != nargs-1: + f.write(', ') + else: + f.write('),\n') + if n%n_per_line == 3 and n != nargs-1: + f.write('\n ') + + f.write(' char const * name, op_set set,\n ') + for n in range (0, nargs): + f.write('op_arg arg'+str(n)) + if nargs != 1 and n != nargs-1: + f.write(', ') + else: + f.write('){\n') + if n%n_per_line == 3 and n != nargs-1: + f.write('\n ') + + f.write('\n char *p_a['+str(nargs)+'] = {') + for n in range (0, nargs): + f.write('0') + if nargs != 1 and n != nargs-1: + f.write(',') + else: + f.write('};\n') + + f.write(' op_arg args['+str(nargs)+'] = {') + for n in range (0, nargs): + f.write('arg'+str(n)) + if nargs != 1 and n != nargs-1: + f.write(', ') + else: + f.write('};\n') + if n%n_per_line == 3 and n != nargs-1: + f.write('\n ') + + for n in range (0, nargs): + f.write(' if(arg'+str(n)+'.idx < -1) {\n') + f.write(' p_a['+str(n)+'] = (char *)op_malloc(-1*args['+str(n)+'].idx*sizeof(T'+str(n)+'));\n }\n') + + f.write('\n //allocate scratch mememory to do double counting in indirect reduction\n') + f.write(' for (int i = 0; i<'+str(nargs)+';i++)\n') + f.write(' if(args[i].argtype == OP_ARG_GBL && args[i].size > blank_args_size )\n') + f.write(' {\n') + f.write(' blank_args_size = args[i].size;\n'); + f.write(' blank_args = (char *)op_malloc(blank_args_size);\n') + f.write(' }\n') + + f.write(' // consistency checks\n') + f.write(' int ninds = 0;\n') + + f.write(' if (OP_diags>0) 
op_args_check(set,'+str(nargs)+',args,&ninds,name);\n\n') + + f.write(' if (OP_diags>2) {\n') + f.write(' if (ninds==0)\n') + f.write(' printf(" kernel routine w/o indirection: %s\\n",name);\n') + f.write(' else\n') + f.write(' printf(" kernel routine with indirection: %s\\n",name);\n') + f.write(' }\n') + + f.write(' // initialise timers\n') + f.write(' double cpu_t1, cpu_t2, wall_t1, wall_t2;\n') + f.write(' op_timers_core(&cpu_t1, &wall_t1);\n\n') + + f.write(' // MPI halo exchange and dirty bit setting, if needed\n') + f.write(' int n_upper = op_mpi_halo_exchanges(set, '+str(nargs)+', args);\n\n') + f.write(' // loop over set elements\n') + f.write(' int halo = 0; \n\n') + + f.write(' for (int n=0; ncore_size) op_mpi_wait_all('+str(nargs)+',args);\n') + f.write(' if (n==set->size) halo = 1;\n') + + for n in range (0, nargs): + f.write(' if (args['+str(n)+'].idx < -1) op_arg_copy_in(n,args['+str(n)+'], (char **)p_a['+str(n)+']);\n') + f.write(' else op_arg_set(n,args['+str(n)+'], &p_a['+str(n)+'],halo);\n') + + f.write('\n kernel( ') + for n in range (0, nargs): + f.write('(T'+str(n)+' *)p_a['+str(n)+']') + if nargs != 1 and n != nargs-1: + f.write(', ') + else: + f.write(');\n') + if n%n_per_line == 3 and n != nargs-1: + f.write('\n ') + + + f.write(' }\n') + f.write(' if ( n_upper == set->core_size || n_upper == 0 )\n op_mpi_wait_all ('+str(nargs)+',args);\n\n') + f.write(' //set dirty bit on datasets touched\n') + f.write(' op_mpi_set_dirtybit('+str(nargs)+', args);\n\n') + + f.write(' //global reduction for MPI execution, if needed \n') + f.write(' //p_a simply used to determine type for MPI reduction\n') + for n in range (0, nargs): + f.write(' op_mpi_reduce(&arg'+str(n)+',(T'+str(n)+' *)p_a['+str(n)+']);\n') + + f.write('\n // update timer record\n') + f.write(' op_timers_core(&cpu_t2, &wall_t2);\n') + f.write('#ifdef COMM_PERF\n') + f.write(' void *k_i = op_mpi_perf_time(name, wall_t2 - wall_t1);\n') + f.write(' op_mpi_perf_comms(k_i, '+str(nargs)+', 
args);\n') + f.write('#else\n') + f.write(' op_mpi_perf_time(name, wall_t2 - wall_t1);\n') + f.write('#endif\n\n') + + for n in range (0, nargs): + f.write(' if(arg'+str(n)+'.idx < -1) {\n') + f.write(' free(p_a['+str(n)+']);\n'); + f.write(' }\n'); + + + f.write('}\n') + +f.close() diff --git a/translator/fortran/README.md b/translator/fortran/README.md new file mode 100644 index 000000000..3b6bc484c --- /dev/null +++ b/translator/fortran/README.md @@ -0,0 +1,38 @@ +### Fortran Code Generators +This directory contains the OP2 code generators written in Python targeting the Fortran API. The parallelisations and optimisations supported by each generator are as follows: + +##### MPI+SEQ + * `op2_gen_mpiseq.py`: Generate host stubs for MPI+SEQ. + * `op2_gen_mpiseq3.py`: Generate host stubs for MPI+SEQ -- optimised by removing the overhead due to Fortran C to F pointer setups. + * `op2_gen_mpivec.py`: Generate host stubs for MPI+SEQ with Intel vectorization optimisations. + +##### OpenMP + * `op2_gen_openmp3.py`: Optimised by removing the overhead due to Fortran C to F pointer setups. + * `op2_gen_openmp2.py`: Version without staging. + * `op2_gen_openmp.py`: Original version - one that most OP2 papers refer to. + +##### CUDA + * `op2_gen_cuda.py` + * `op2_gen_cuda_permute.py`: Permute does a different coloring (permute execution within blocks by color). + * `op2_gen_cudaINC.py`: Stages increment data only in shared memory. + * `op2_gen_cuda_old.py`: Code generator targeting Fermi GPUs. + +##### If hydra: + * `op2_gen_cuda_hydra()`: Includes several Hydra specific features. + +#### Invoking the Code Generator +Uncomment the parallelization you want to code generate in `op2_fortran.py`. 
For example for CUDA code generation do: +``` +#op2_gen_openmp(str(sys.argv[init_ctr]), date, consts, kernels, hydra) +op2_gen_cuda(str(sys.argv[1]), date, consts, kernels, hydra) +``` + +Make `./op2_fortran.py` executable +``` +chmod a+x ./op2_fortran.py +``` + +Invoke the code generator by supplying the files that contain op_* API calls. Thus for example for Airfoil do the following. +``` +./op2_fortran.py airfoil.F90 +``` diff --git a/translator/fortran/op2_fortran.py b/translator/fortran/op2_fortran.py new file mode 100755 index 000000000..2065274f5 --- /dev/null +++ b/translator/fortran/op2_fortran.py @@ -0,0 +1,1059 @@ +#!/usr/bin/env python3 + +""" + OP2 source code transformation tool + + This tool parses the user's original source code to produce + target-specific code to execute the user's kernel functions. + + This prototype is written in Python and is directly based on the + parsing and code generation of the matlab source code transformation code + + usage: ./op2_fortran.py 'file1','file2',... + + This code generator is for parsing applications written using the OP2 FORTRAN API + + This takes as input + + file1.F90, file2.F90, ... + + and produces as output modified versions ..... + + file1_op.F90, file2_op.F90, ... 
+ + then calls a number of target-specific code generators + to produce individual kernel files of the form + + xxx_kernel.F90 -- for OpenMP x86 execution + xxx_kernel.CUF -- for CUDA execution (based on PGI CUDA FORTRAN) + +""" + +import sys +import re +import datetime + +#import openmp code generation function +import op2_gen_openmp +from op2_gen_openmp import * +import op2_gen_openmp2 +from op2_gen_openmp2 import * +import op2_gen_openmp3 +from op2_gen_openmp3 import * + +import op2_gen_openacc +from op2_gen_openacc import * + +import op2_gen_openmp4 +from op2_gen_openmp4 import * + + + +#import mpiseq code generation function +import op2_gen_mpiseq +from op2_gen_mpiseq import * +import op2_gen_mpiseq2 +from op2_gen_mpiseq2 import * +import op2_gen_mpiseq3 +from op2_gen_mpiseq3 import * +import op2_gen_mpivec +from op2_gen_mpivec import * + + +#import cuda code generation function +import op2_gen_cuda +from op2_gen_cuda import * +import op2_gen_cuda_gbl +from op2_gen_cuda_gbl import * +import op2_gen_cuda_color2 +from op2_gen_cuda_color2 import * +import op2_gen_cuda_permute +from op2_gen_cuda_permute import * +import op2_gen_cudaINC +from op2_gen_cudaINC import * +import op2_gen_cuda_old +from op2_gen_cuda_old import * + + +# +# declare constants +# + +ninit = 0; nexit = 0; npart = 0; nhdf5 = 0; nconsts = 0; nkernels = 0; +consts = [] +kernels = [] + +OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + +OP_READ = 1; OP_WRITE = 2; OP_RW = 3; +OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + +OP_accs_labels = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + +global file_format, cont, comment + +file_format = 0 +cont = '& ' +comment = '! 
hydra = 0
bookleaf = 0

# from http://stackoverflow.com/a/241506/396967
##########################################################################
# Remove comments from text
##########################################################################

# Matches, in order: a `//` comment up to end of line, a `/* ... */` block,
# a single-quoted literal, a double-quoted literal. Quoted literals are
# matched only so that comment markers inside them are left alone.
_COMMENT_OR_STRING_RE = re.compile(
    r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
    re.DOTALL | re.MULTILINE,
)


def comment_remover(text):
    """Return *text* with `//` and `/* */` comments removed.

    Quoted string literals are returned unchanged, including any comment
    markers they contain.

    NOTE(review): the pattern is C-style; an unquoted `//` (Fortran's
    concatenation operator) would be treated as a comment start -- confirm
    this is acceptable for the Fortran sources this tool parses.
    """
    def _drop_comments(match):
        token = match.group(0)
        # Only the two comment alternatives start with '/'.
        return "" if token.startswith('/') else token

    return _COMMENT_OR_STRING_RE.sub(_drop_comments, text)


##########################################################################
# parsing for op_init/op_exit/op_partition/op_hdf5 calls
##########################################################################

def op_parse_calls(text):
    """Count API-call markers in *text* after stripping comments.

    Returns a tuple ``(inits, exits, parts, hdf5s)`` holding the number of
    occurrences of the substrings 'op_init', 'op_exit', 'op_partition' and
    'hdf5'. These are plain substring counts, so e.g. 'op_init_base' also
    counts as one 'op_init'.
    """
    # remove comments just for this call
    stripped = comment_remover(text)
    return tuple(
        stripped.count(marker)
        for marker in ('op_init', 'op_exit', 'op_partition', 'hdf5')
    )


##########################################################################
# parsing for op_decl_const calls
##########################################################################

def op_decl_const_parse(text):
    """Collect every ``call ... op_decl_const...( a, b, c )`` found in *text*.

    Returns a list of dicts with keys 'loc' (match start offset), 'dim',
    'type', 'name' and 'name2'. If any call does not have exactly three
    comma-separated arguments, an error is printed and ``None`` is returned
    instead of a list.
    """
    consts = []
    # Raw string: `\(` must reach the regex engine as an escaped paren.
    for m in re.finditer(r'call(.+)op_decl_const(.*)\((.*)\)', text):
        args = m.group(3).split(',')

        # check for syntax errors
        if len(args) != 3:
            print('Error in op_decl_const : must have three arguments')
            return None

        consts.append({
            'loc': m.start(),
            'dim': args[1].strip(),
            # 'type' is filled from the same argument as 'dim', exactly as
            # before. NOTE(review): confirm this is intended.
            'type': args[1].strip(),
            'name': args[0].strip(),
            'name2': args[2].strip(),
        })

    return consts
def arg_parse2(text, j):
    """Split the first parenthesised argument list at or after index *j*.

    Scans *text* from *j*. Arguments are separated by commas at nesting
    depth 1; both ``()`` and ``{}`` raise and lower the nesting depth, so
    commas inside nested calls or braces do not split. Each argument is
    returned stripped of surrounding whitespace.

    Returns the list of arguments once the opening parenthesis is closed.

    Raises:
        IndexError: if *text* ends before the argument list is closed.
    """
    depth = 0
    arglist = []
    prev_start = j
    for pos in range(j, len(text)):
        ch = text[pos]
        if ch == '(':
            if depth == 0:
                # First opening paren: arguments start right after it.
                prev_start = pos + 1
            depth += 1
        elif ch == ')':
            depth -= 1
            if depth == 0:
                arglist.append(text[prev_start:pos].strip())
                return arglist
        elif ch == ',':
            if depth == 1:
                arglist.append(text[prev_start:pos].strip())
                prev_start = pos + 1
        elif ch == '{':
            depth += 1
        elif ch == '}':
            depth -= 1

    # Same exception type the unbounded scan used to raise, but with a
    # message that says what is actually wrong.
    raise IndexError(
        'unbalanced parentheses: argument list starting at index %d is never closed' % j
    )


# Quoted Fortran type spellings and the short tags they are rewritten to.
# Checked in this order; the first spelling found anywhere in the text wins.
_TYPE_TAGS = (
    ('"INTEGER(kind=4)"', '"i4"'),
    ('"INTEGER(kind=4):soa"', '"i4:soa"'),
    ('"REAL(kind=8)"', '"r8"'),
    ('"REAL(kind=8):soa"', '"r8:soa"'),
    ('"REAL(kind=4)"', '"r4"'),
    ('"REAL(kind=4):soa"', '"r4:soa"'),
    ('"logical"', '"logical"'),
)


def typechange(text):
    """Return the short type tag for *text*, or *text* itself if none matches.

    When *text* contains one of the known quoted type spellings, the whole
    return value is the corresponding tag (not *text* with a substitution).
    """
    for spelling, tag in _TYPE_TAGS:
        if spelling in text:
            return tag
    return text
def get_opt_arg_dat(arg_string, j):
    """Parse the ``op_opt_arg_dat(...)`` call located at offset ``j``.

    The text from the first ``(`` at or after ``j`` up to the position
    reported by ``arg_parse`` is cleaned with ``comment_remover``, has every
    ``&`` removed, and is split into arguments with ``arg_parse2``.

    Returns a dict with the keys ``type``, ``opt``, ``dat``, ``idx``,
    ``map``, ``dim``, ``typ`` and ``acc``.  When the call does not have
    exactly seven arguments an error is printed and ``None`` is returned.
    """
    closing = arg_parse(arg_string, j+1)
    call_text = arg_string[arg_string.find('(', j):closing+1]

    # drop comments and '&' characters before splitting
    call_text = comment_remover(call_text).replace('&', '')

    fields = arg_parse2(call_text, 0)
    if len(fields) != 7:
        print('Error parsing op_opt_arg_dat(%s): must have 7 arguments' \
              % call_text)
        return

    keys = ('opt', 'dat', 'idx', 'map', 'dim', 'typ', 'acc')
    temp_dat = {'type': 'op_opt_arg_dat'}
    temp_dat.update(zip(keys, (field.strip() for field in fields)))

    # expand the short type codes; a ':soa' suffix is left untouched
    expanded = temp_dat['typ'].replace('"r8', '"REAL(kind=8)')
    temp_dat['typ'] = expanded.replace('"i4', '"INTEGER(kind=4)')
    return temp_dat
def get_opt_arg_gbl(arg_string, k):
    """Parse the ``op_opt_arg_gbl(...)`` call located at offset ``k``.

    The text between the first ``(`` at or after ``k`` and the position
    reported by ``arg_parse`` is cleaned with ``comment_remover``, has every
    ``&`` removed, and is split into arguments with ``arg_parse2``.

    Returns a dict with the keys ``type``, ``opt``, ``data``, ``dim``,
    ``typ`` and ``acc``.  When the call does not have exactly five arguments
    an error is printed and ``None`` is returned.
    """
    loc = arg_parse(arg_string, k+1)
    gbl_args_string = arg_string[arg_string.find('(', k)+1:loc]

    # remove comments and '&' characters
    gbl_args_string = comment_remover(gbl_args_string)
    gbl_args_string = gbl_args_string.replace('&', '')

    gbl_args = arg_parse2('(' + gbl_args_string + ')', 0)

    # check for syntax errors; the message names this call, not op_arg_gbl
    if len(gbl_args) != 5:
        print('Error parsing op_opt_arg_gbl(%s): must have five arguments' \
              % gbl_args_string)
        return None

    # split the five arguments into a struct tagged as op_opt_arg_gbl
    temp_gbl = {'type': 'op_opt_arg_gbl',
                'opt':  gbl_args[0].strip(),
                'data': gbl_args[1].strip(),
                'dim':  gbl_args[2].strip(),
                'typ':  gbl_args[3].strip(),
                'acc':  gbl_args[4].strip()}

    # Expand short type codes (exact matches only).
    # NOTE(review): '"logical"' becomes '"logical*1"' here, whereas the
    # non-optional get_arg_gbl leaves it as '"logical"' - confirm that the
    # difference is intended.
    long_types = {'"r8"': '"REAL(kind=8)"',
                  '"i4"': '"INTEGER(kind=4)"',
                  '"logical"': '"logical*1"'}
    temp_gbl['typ'] = long_types.get(temp_gbl['typ'], temp_gbl['typ'])

    return temp_gbl
def op_par_loop_parse(text):
    """Collect every ``op_par_loop`` call found in ``text``.

    For each occurrence of the string ``op_par_loop`` the call's argument
    list is extracted and every ``op_arg_dat``, ``op_arg_gbl``,
    ``op_opt_arg_dat`` and ``op_opt_arg_gbl`` inside it is parsed, in order
    of appearance, by the matching ``get_*`` helper.

    Returns a list of dicts with the keys ``loc`` (offset of the call in
    ``text``), ``name1`` and ``set`` (first two call arguments), ``args``
    (the parsed argument dicts) and ``nargs`` (their count).
    """
    # (keyword, parser, distance to skip before searching for the next one)
    arg_kinds = (
        ("op_arg_dat", get_arg_dat, 11),
        ("op_arg_gbl", get_arg_gbl, 11),
        ("op_opt_arg_dat", get_opt_arg_dat, 15),
        ("op_opt_arg_gbl", get_opt_arg_gbl, 15),
    )
    keyword = "op_par_loop"

    found = []
    start = text.find(keyword)
    while start > -1:
        call_body = text[text.find('(', start)+1:arg_parse(text, start+11)]
        head_args = arg_parse2(text, start+11)

        # next unread occurrence of each keyword; -1 once exhausted
        cursors = [call_body.find(word) for word, _, _ in arg_kinds]
        parsed = []
        while any(cursor > -1 for cursor in cursors):
            # handle whichever keyword occurs earliest in the call body
            live = [n for n, cursor in enumerate(cursors) if cursor > -1]
            which = min(live, key=lambda n: cursors[n])
            word, parser, step = arg_kinds[which]
            parsed.append(parser(call_body, cursors[which]))
            cursors[which] = call_body.find(word, cursors[which] + step)

        found.append({'loc': start,
                      'name1': head_args[0].strip(),
                      'set': head_args[1].strip(),
                      'args': parsed,
                      'nargs': len(parsed)})
        start = text.find(keyword, start + 10)

    print('\n\n')
    return found
' + elif src_file.split('.')[1].upper() == 'F77' or src_file.split('.')[1].upper() == 'F': + file_format = 77 + cont = '& ' + cont_end = '' + comment = 'C ' + else: + print("Error in parsing file: unsupported file format, only *.F90, *.F95 or *.F77 supported") + exit() + +############ check for op_init/op_exit/op_partition/op_hdf5 calls ######## + + inits, exits, parts, hdf5s = op_parse_calls(text) + + if inits+exits+parts+hdf5s > 0: + print(' ') + if inits > 0: + print('contains op_init call') + if auto_soa!='0': + text = append_init_soa(text) + if exits > 0: + print('contains op_exit call') + if parts > 0: + print('contains op_partition call') + if hdf5s > 0: + print('contains op_hdf5 calls') + + ninit = ninit + inits + nexit = nexit + exits + npart = npart + parts + nhdf5 = nhdf5 + hdf5s + +########################## parse and process constants ################### + + const_args = [] + if not hydra: + const_args = op_decl_const_parse(text) + + #cleanup '&' symbols from name and convert dim to integer + for i in range(0,len(const_args)): + if const_args[i]['name'][0] == '&': + const_args[i]['name'] = const_args[i]['name'][1:] + const_args[i]['dim'] = int(const_args[i]['dim']) + + #check for repeats + nconsts = 0 + for i in range(0,len(const_args)): + repeat = 0 + name = const_args[i]['name'] + for c in range(0,nconsts): + if const_args[i]['name'] == consts[c]['name']: + repeat = 1 + if const_args[i]['type'] != consts[c]['type']: + print('type mismatch in repeated op_decl_const') + if const_args[i]['dim'] != consts[c]['dim']: + print('size mismatch in repeated op_decl_const') + + if repeat > 0: + print('repeated global constant ' + const_args[i]['name']) + else: + print('\nglobal constant ('+ const_args[i]['name'].strip() + ') of size ' \ + + str(const_args[i]['dim'] + ' and type ' + const_args[i]['type'].strip())) + + #store away in master list + if repeat == 0: + nconsts = nconsts + 1 + temp = {'dim': const_args[i]['dim'], + 'type': 
const_args[i]['type'].strip(), + 'name': const_args[i]['name'].strip()} + consts.append(temp) + +###################### parse and process op_par_loop calls ############### + + loop_args = op_par_loop_parse(text) + + for i in range (0, len(loop_args)): + name = loop_args[i]['name1'] + set_name = loop_args[i]['set'] + nargs = loop_args[i]['nargs'] + code_loc = loop_args[i]['loc'] + print('\nprocessing kernel '+name+' with '+str(nargs)+' arguments', end=' ') + +# +# process arguments +# + +# +# NOTE: Carlo's FORTRAN API has one fewer arguments than C++ API +# + var = ['']*nargs + idxs = [0]*nargs + dims = ['']*nargs + maps = [0]*nargs + mapnames = ['']*nargs + typs = ['']*nargs + accs = [0]*nargs + soaflags = [0]*nargs + optflags= [0]*nargs + + for m in range (0,nargs): + arg_type = loop_args[i]['args'][m]['type'] + args = loop_args[i]['args'][m] + + if arg_type.strip() == 'op_arg_dat' or arg_type.strip() == 'op_opt_arg_dat': + var[m] = args['dat'] + idxs[m] = args['idx'] + if str(args['map']).strip() == 'OP_ID': + maps[m] = OP_ID + if int(idxs[m]) != -1: + print('invalid index for argument'+str(m)) + else: + maps[m] = OP_MAP + mapnames[m] = str(args['map']).strip() + + dims[m] = args['dim'] + soa_loc = args['typ'].find(':soa') + if ((auto_soa=='1') and (((not dims[m].isdigit()) or int(dims[m])>1)) and (soa_loc < 0)): + soa_loc = len(args['typ'])-1 + + if soa_loc > 0: + soaflags[m] = 1 + typs[m] = args['typ'][1:soa_loc] + else: + typs[m] = args['typ'][1:-1] + + l = -1 + for l in range(0,len(OP_accs_labels)): + if args['acc'].strip() == OP_accs_labels[l].strip(): + break + + if l == -1: + print('unknown access type for argument '+str(m)) + else: + accs[m] = l+1 + + if arg_type.strip() == 'op_opt_arg_dat': + optflags[m] = 1 + # if soaflags[m] == 1: + # print "ERROR: cannot have SoA and optional argument at the same time" + # sys.exit(-1) + else: + optflags[m] = 0 + + if arg_type.strip() == 'op_arg_gbl' or arg_type.strip() == 'op_opt_arg_gbl': + maps[m] = OP_GBL + var[m] 
= args['data'] + dims[m] = args['dim'] + typs[m] = args['typ'][1:-1] + if arg_type.strip() == 'op_opt_arg_gbl': + optflags[m] = 1 + else: + optflags[m] = 0 + + l = -1 + for l in range(0,len(OP_accs_labels)): + if args['acc'].strip() == OP_accs_labels[l].strip(): + break + + if l == -1: + print('unknown access type for argument '+str(m)) + else: + accs[m] = l+1 + + if (maps[m]==OP_GBL) and (accs[m]==OP_WRITE or accs[m]==OP_RW): + print('invalid access type for argument '+str(m)) + + if (maps[m]!=OP_GBL) and (accs[m]==OP_MIN or accs[m]==OP_MAX): + print('invalid access type for argument '+str(m)) + + print(' ') +# +# identify indirect datasets +# + ninds = 0 + inds = [0]*nargs + invinds = [0]*nargs + invmapinds = [0]*nargs + mapinds = [0]*nargs + indtyps = ['']*nargs + inddims = ['']*nargs + indaccs = [0]*nargs + + j = [i for i, x in enumerate(maps) if x == OP_MAP] + + while len(j) > 0: + + indtyps[ninds] = typs[j[0]] + inddims[ninds] = dims[j[0]] + indaccs[ninds] = accs[j[0]] + invinds[ninds] = j[0] #inverse mapping + ninds = ninds + 1 + for i in range(0,len(j)): + if var[j[0]] == var[j[i]] and typs[j[0]] == typs[j[i]] \ + and accs[j[0]] == accs[j[i]]: #same variable + inds[j[i]] = ninds + + + k = [] + for i in range(0,len(j)): + if not (var[j[0]] == var[j[i]] and typs[j[0]] == typs[j[i]] \ + and accs[j[0]] == accs[j[i]]): #same variable + k = k+[j[i]] + j = k + + if ninds > 0: + invmapinds = invinds[:] + for i in range(0,ninds): + for j in range(0,i): + if (mapnames[invinds[i]] == mapnames[invinds[j]]): + invmapinds[i] = invmapinds[j] + for i in range(0,nargs): + mapinds[i] = i + for j in range(0,i): + if (maps[i] == OP_MAP) and (mapnames[i] == mapnames[j]) and (idxs[i] == idxs[j]): + mapinds[i] = mapinds[j] +# +# check for repeats +# + repeat = False + rep1 = False + rep2 = False + + for nk in range (0,nkernels): + rep1 = kernels[nk]['name'] == name and \ + kernels[nk]['nargs'] == nargs and \ + kernels[nk]['ninds'] == ninds + if rep1: + rep2 = True + for arg in 
range(0,nargs): + rep2 = rep2 and kernels[nk]['dims'][arg] == dims[arg] and \ + kernels[nk]['maps'][arg] == maps[arg] and \ + kernels[nk]['typs'][arg] == typs[arg] and \ + kernels[nk]['accs'][arg] == accs[arg] and \ + kernels[nk]['idxs'][arg] == idxs[arg] and \ + kernels[nk]['soaflags'][arg] == soaflags[arg] and \ + kernels[nk]['optflags'][arg] == optflags[arg] and \ + kernels[nk]['inds'][arg] == inds[arg] + + for arg in range(0,ninds): + rep2 = rep2 and kernels[nk]['inddims'][arg] == inddims[arg] and \ + kernels[nk]['indaccs'][arg] == indaccs[arg] and \ + kernels[nk]['indtyps'][arg] == indtyps[arg] and \ + kernels[nk]['invinds'][arg] == invinds[arg] + + if rep2: + print('repeated kernel with compatible arguments: '+ kernels[nk]['name']) + repeat = True + else: + print('repeated kernel with incompatible arguments: ERROR') + sys.exit(-1) + break + +# +# output various diagnostics +# + if not repeat: + print(' local constants:', end=' ') + for arg in range(0,nargs): + if maps[arg] == OP_GBL and accs[arg] == OP_READ: + print(str(arg), end=' ') + print('\n global reductions:', end=' ') + for arg in range(0,nargs): + if maps[arg] == OP_GBL and accs[arg] != OP_READ: + print(str(arg), end=' ') + print('\n direct arguments:', end=' ') + for arg in range(0,nargs): + if maps[arg] == OP_ID: + print(str(arg), end=' ') + print('\n indirect arguments:', end=' ') + for arg in range(0,nargs): + if maps[arg] == OP_MAP: + print(str(arg), end=' ') + if ninds > 0: + print('\n number of indirect datasets: '+str(ninds), end=' ') + + print('\n') +# +# store away in master list +# + if not repeat: + nkernels = nkernels+1; + temp = {'name': name, + 'set' : set_name, + 'nargs': nargs, + 'dims': dims, + 'maps': maps, + 'var': var, + 'typs': typs, + 'accs': accs, + 'idxs': idxs, + 'inds': inds, + 'soaflags': soaflags, + 'optflags': optflags, + + 'ninds': ninds, + 'inddims': inddims, + 'indaccs': indaccs, + 'indtyps': indtyps, + 'invinds': invinds, + 'mapnames' : mapnames, + 'mapinds': 
mapinds, + 'invmapinds' : invmapinds } + + if hydra==1: + temp['master_file'] = src_file.split('.')[0].replace('mod_','') + temp['mod_file'] = 'external '+name + i = text[0:code_loc].rfind(temp['mod_file']) + if i < 0: + print(' ERROR: no module use statement ('+temp['mod_file']+') found! ') + if bookleaf==1: + file_part = src_file.split('/') + file_part = file_part[len(file_part)-1] + temp['master_file'] = file_part.split('.')[0] + if temp['master_file'] in name: + temp['mod_file'] = temp['master_file'] + '_kernels.f90' + else: + temp['mod_file'] = 'common_kernels.f90' + + kernels.append(temp) + +########################## output source file ############################ + + fid = open(src_file.replace('.','_op.'), 'w') +# if file_format == 90: +# fid = open(src_file.split('.')[0]+'_op.F90', 'w') +# elif file_format == 77: +# fid = open(src_file.split('.')[0]+'_op.F', 'w') + date = datetime.datetime.now() + #fid.write('!\n! auto-generated by op2_fortran.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n') + fid.write('!\n! 
auto-generated by op2_fortran.py\n!\n\n') + + loc_old = 0 + #read original file and locate header location + if bookleaf: + loc_header = [text.lower().find('use op2_bookleaf')] + else: + loc_header = [text.find('use OP2_Fortran_Reference')] + + #get locations of all op_decl_consts + n_consts = len(const_args) + loc_consts = [0]*n_consts + for n in range(0,n_consts): + loc_consts[n] = const_args[n]['loc'] + + #get locations of all op_par_loops + n_loops = len(loop_args); + loc_loops = [0]*n_loops + for n in range(0,n_loops): + loc_loops[n] = loop_args[n]['loc'] + + locs = sorted(loc_header+loc_consts+loc_loops) + + + +# +# process header, loops and constants +# + for loc in range(0,len(locs)): + fid.write(text[loc_old:locs[loc]-1]) + loc_old = locs[loc]-1 + indent = '' + ind = 0; + while 1: + if text[locs[loc]-ind] == '\n': + break + indent = indent + ' ' + ind = ind + 1 + + if locs[loc] in loc_header: + line = '' + if hydra==0: + for nk in range (0,len(kernels)): + if text.find(kernels[nk]['name']) > -1: + line = line +'\n'+' use ' + kernels[nk]['name'].upper()+'_MODULE' + line = line + '\n'+indent + + fid.write(line[2:len(line)]); + if bookleaf: + loc_old = locs[loc] # keep the original include + else: + loc_old = locs[loc]+25 + continue + + if locs[loc] in loc_consts:# stripping the op_decl_consts -- as there is no implementation required + line = '' + fid.write(line); + endofcall = text.find('\n', locs[loc]) + loc_old = endofcall+1 + continue + + if locs[loc] in loc_loops: + indent = indent + ' '*len('op_par_loop') + if file_format == 77: + indent=' ' + endofcall = arg_parse(text,locs[loc]+11) + #endofcall = text.find('\n\n', locs[loc]) + curr_loop = loc_loops.index(locs[loc]) + name = loop_args[curr_loop]['name1'] + if file_format == 90: + line = str(' '+name+'_host(&\n'+indent+'& "'+name+'",'+ + loop_args[curr_loop]['set']+', '+cont_end+'\n') + elif file_format == 77: + line = str(' '+name+'_host(\n'+indent+'& "'+name+'",'+ + loop_args[curr_loop]['set']+', 
'+cont_end+'\n') + + for arguments in range(0,loop_args[curr_loop]['nargs']): + elem = loop_args[curr_loop]['args'][arguments] + if elem['type'] == 'op_arg_dat': + line = line + indent + cont + elem['type'] + '(' + elem['dat'] + ','+ elem['idx'] \ + + ','+ elem['map'] + ','+ elem['dim']+ ','+ typechange(elem['typ']) +','+ elem['acc'] + elif elem['type'] == 'op_opt_arg_dat': + line = line + indent + cont + elem['type'] + '(' +elem['opt']+','+ elem['dat'] + ','+ elem['idx'] \ + + ','+ elem['map'] + ','+ elem['dim']+ ','+ typechange(elem['typ']) +','+ elem['acc'] + elif elem['type'] == 'op_arg_gbl': + line = line + indent + cont + elem['type'] + '(' + elem['data'] + ','+ elem['dim'] \ + +','+ typechange(elem['typ'])+','+ elem['acc'] + elif elem['type'] == 'op_opt_arg_gbl': + line = line + indent + cont + elem['type'] + '(' + elem['opt']+','+ elem['data'] + ','+ elem['dim'] \ + +','+ typechange(elem['typ'])+','+ elem['acc'] + + if arguments != loop_args[curr_loop]['nargs'] - 1: + line = line + '), '+cont_end+'\n' + else: + line = line + '))\n' + + fid.write(line) + + loc_old = endofcall+1 + continue + + + + fid.write(text[loc_old:]) + fid.close() + if hydra == 1 or bookleaf==1: + fid = open(src_file.replace('.','_op.'), 'r') + #if file_format == 90: + # fid = open(src_file.split('.')[0]+'_op.F90', 'r') + #elif file_format == 77: + # fid = open(src_file.split('.')[0]+'_op.F', 'r') + + + text = fid.read() + fid.close() + if hydra: + #replace = 'use OP2_FORTRAN_DECLARATIONS\n#ifdef OP2_ENABLE_CUDA\n use HYDRA_CUDA_MODULE\n#endif\n' + replace = 'use OP2_FORTRAN_DECLARATIONS\n' + text = text.replace('use OP2_FORTRAN_DECLARATIONS\n',replace) + if bookleaf: + text = text.replace('USE OP2_Fortran_Reference\n','') + text = text.replace('USE common_kernels','! USE common_kernels') + file_part = src_file.split('/') + file_part = file_part[len(file_part)-1] + master_file = file_part.split('.')[0] + text = text.replace('USE '+master_file+'_kernels','! 
USE USE '+master_file+'_kernels') + for nk in range (0,len(kernels)): + if hydra: + replace = 'use '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][9:]+'_module_MODULE'+'\n' + else: + replace = 'use '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][4:]+'_MODULE'+'\n' + text = text.replace(kernels[nk]['mod_file']+'\n', replace) + if hydra: + pattern = re.compile(r'(\n\s*implicit\s*none\s*)(\n\s*use .*)',re.IGNORECASE) + count = 1 + while count>0: + res = re.subn(pattern, r'\2\1', text) + text = res[0] + count = res[1] + #do nothing + # x=1 + + fid = open(src_file.replace('.','_op.'), 'w') + #if file_format == 90: + # fid = open(src_file.split('.')[0]+'_op.F90', 'w') + #elif file_format == 77: + # fid = open(src_file.split('.')[0]+'_op.F', 'w') + fid.write(text) + fid.close() + + f.close() +#end of loop over input source files + +########################## errors and warnings ############################ + +if ninit==0: + print(' ') + print('-----------------------------') + print(' ERROR: no call to op_init ') + print('-----------------------------') + +if nexit==0: + print(' ') + print('-------------------------------') + print(' WARNING: no call to op_exit ') + print('-------------------------------') + +if npart==0 and nhdf5>0: + print(' ') + print('---------------------------------------------------') + print(' WARNING: hdf5 calls without call to op_partition ') + print('---------------------------------------------------') + +########## finally, generate target-specific kernel files ################ + +#MPI+SEQ +#op2_gen_mpiseq(str(sys.argv[init_ctr]), date, consts, kernels, hydra) # generate host stubs for MPI+SEQ +op2_gen_mpiseq3(str(sys.argv[init_ctr]), date, consts, kernels, hydra, bookleaf) # generate host stubs for MPI+SEQ -- optimised by removing the overhead due to fortran c to f pointer setups +op2_gen_mpivec(str(sys.argv[init_ctr]), date, consts, kernels, hydra, bookleaf) # generate host stubs for MPI+SEQ with intel vectorization 
optimisations + +#OpenMP +op2_gen_openmp3(str(sys.argv[init_ctr]), date, consts, kernels, hydra, bookleaf) # optimised by removing the overhead due to fortran c to f pointer setups +#op2_gen_openmp2(str(sys.argv[init_ctr]), date, consts, kernels, hydra) # version without staging +#op2_gen_openmp(str(sys.argv[init_ctr]), date, consts, kernels, hydra) # original version - one that most op2 papers refer to + +#CUDA +#op2_gen_cuda(str(sys.argv[1]), date, consts, kernels, hydra, bookleaf) +#op2_gen_cuda_gbl(str(sys.argv[init_ctr]), date, consts, kernels, hydra,bookleaf) # global coloring +#op2_gen_cuda_permute(str(sys.argv[init_ctr]), date, consts, kernels, hydra,bookleaf) # permute does a different coloring (permute execution within blocks by color) +op2_gen_cuda_color2(str(sys.argv[init_ctr]), date, consts, kernels, hydra,bookleaf) # does global coloring +#op2_gen_cudaINC(str(sys.argv[1]), date, consts, kernels, hydra) # stages increment data only in shared memory +#op2_gen_cuda_old(str(sys.argv[1]), date, consts, kernels, hydra) # Code generator targettign Fermi GPUs + +#OpenACC +#op2_gen_openacc(str(sys.argv[init_ctr]), date, consts, kernels, hydra, bookleaf) # optimised by removing the overhead due to fortran c to f pointer setups + +#OpenMP4 offload +op2_gen_openmp4(str(sys.argv[init_ctr]), date, consts, kernels, hydra, bookleaf) # optimised by removing the overhead due to fortran c to f pointer setups + +#if hydra: +# op2_gen_cuda_hydra() #includes several Hydra specific features + +########################################################################## +# ** END MAIN APPLICATION ** +########################################################################## diff --git a/translator/fortran/op2_gen_cuda.py b/translator/fortran/op2_gen_cuda.py new file mode 100644 index 000000000..ec686d0ef --- /dev/null +++ b/translator/fortran/op2_gen_cuda.py @@ -0,0 +1,1802 @@ +########################################################################## +# +# CUDA code generator 
def comm(line):
    """Append ``line`` as a comment to the module-level ``file_text``.

    An empty ``line`` emits a bare newline.  Otherwise a Fortran comment is
    written when ``FORTRAN`` is set, or an indented ``//`` comment when
    ``CPP`` is set.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
        return
    indent = ' ' * depth
    if FORTRAN:
        file_text += '! ' + line + '\n'
    elif CPP:
        file_text += indent + '//' + line + '\n'


def rep(line, m):
    """Return ``line`` with the placeholder tokens filled in for argument ``m``.

    The values come from the module-level lists ``dims``, ``idxs``, ``typs``,
    ``indtyps`` and ``inddims``.  Fortran output numbers arguments from one
    and uses the tokens ``DIMS``/``TYPS``; C++ output numbers from zero and
    uses ``DIM``/``TYP``.
    """
    def fortran_pairs():
        # indirect-dataset tokens only exist while m indexes such a dataset
        if m < len(inddims):
            yield 'INDDIM', str(inddims[m])
            yield 'INDTYP', str(indtyps[m])
        yield 'INDARG', 'ind_arg' + str(m + 1)
        yield 'DIMS', str(dims[m])
        yield 'ARG', 'arg' + str(m + 1)
        yield 'TYPS', typs[m]
        yield 'IDX', str(int(idxs[m]))

    def cpp_pairs():
        yield 'INDDIM', str(inddims[m])
        yield 'INDTYP', str(indtyps[m])
        yield 'INDARG', 'ind_arg' + str(m)
        yield 'DIM', str(dims[m])
        yield 'ARG', 'arg' + str(m)
        yield 'TYP', typs[m]
        yield 'IDX', str(int(idxs[m]))

    if FORTRAN:
        pairs = fortran_pairs()
    elif CPP:
        pairs = cpp_pairs()
    else:
        pairs = ()

    # generators keep each value lazily computed, right before its own
    # substitution, and the substitutions run strictly in the listed order
    for token, value in pairs:
        line = re.sub(token, value, line)
    return line


def code(text):
    """Append ``text`` to ``file_text``, indented by ``depth`` spaces.

    Placeholders are expanded through ``rep`` for the current argument
    ``g_m``.  An empty ``text`` is written without indentation.  Nothing is
    written when neither ``FORTRAN`` nor ``CPP`` is set.
    """
    global file_text
    indent = ' ' * depth if len(text) else ''
    if FORTRAN or CPP:
        file_text += indent + rep(text, g_m) + '\n'


def code_pre(text):
    """Like ``code`` but never indented (used for preprocessor-style lines)."""
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text, g_m) + '\n'


def DO(i, start, finish):
    """Open a counted loop over ``start`` .. ``finish - 1`` and indent by two."""
    global depth
    if FORTRAN:
        code(''.join(('DO ', i, ' = ', start, ', ', finish, ' - 1, 1')))
    elif CPP:
        code(''.join(('for ( int ', i, '=', start, '; ',
                      i, '<', finish, '; ', i, '++ ){')))
    depth += 2
g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1, '+step) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+' = '+i+' + '+step+' ){') + depth += 2 + +def DOWHILE(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO WHILE ('+line+' )') + elif CPP: + code('while ('+ line+ ' )') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + +def arg_parse(text,j): + + depth = 0 + loc2 = j; + while 1: + if text[loc2] == '(': + depth = depth + 1 + + elif text[loc2] == ')': + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +def op2_gen_cuda(master, date, consts, kernels, hydra, bookleaf): + + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + header_text = '' + body_text = '' + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + 
any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + + +# +# set two logicals +# + j = -1 + ind_rw = 0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + if maps[i] == OP_MAP and accs[i] == OP_RW: + ind_rw = 1 + ind_inc = j >= 0 + + j = -1 + reduct_mdim = 0 + reduct_1dim = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and (accs[i] == OP_INC or accs[i] == OP_MAX or accs[i] == OP_MIN): + j = i + if (not dims[i].isdigit()) or int(dims[i])>1: + reduct_mdim = 1 + if (accs[i] == OP_MAX or accs[i] == OP_MIN): + print('ERROR: Multidimensional MIN/MAX reduction not yet implemented') + else: + reduct_1dim = 1 + if maps[i] == OP_GBL and accs[i] == OP_WRITE: + j = i + reduct = j >= 0 + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + code('USE HYDRA_CUDA_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + if not bookleaf: + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + for g_m in 
range(0,nargs): + if maps[g_m] == OP_GBL: + if (accs[g_m]== OP_INC or accs[g_m]== OP_MIN or accs[g_m]== OP_MAX): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)+name) + if ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opGblDat'+str(g_m+1)+'Device'+name) + + + code('') + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + code('') + if is_soa > -1: + code('#include "op2_macros.h"') + code('') + code('CONTAINS') + code('') + +########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct_1dim: + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = 
ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + code('attributes (device) SUBROUTINE ReductionInt4(reductionResult,inputValue,reductionOperation)') + code('INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult') + code('INTEGER(kind=4) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: sharedInt4') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedInt4(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1)') + code('CASE (1)') + IF('sharedInt4(threadID + i1) < sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(threadID + i1) > sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedInt4(0)') + code('CASE (1)') + IF('sharedInt4(0) < reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(0) > reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + if reduct_mdim: + comm('Multidimensional reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8Mdim(reductionResult,inputValue,reductionOperation,dim)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(:) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), VALUE :: dim') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: d') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2)') + ENDDO() + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1)') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + + +########################################################################## +# Inline user kernel function +########################################################################## + if hydra: + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','attributes(host) subroutine') + text = text.replace('subroutine '+name, 'subroutine '+name) + file_text += text + code('') + code('') + i = text.find('const2.inc') + if i > -1: + fi2 = open("hydra_constants_list.txt","r") + for line in fi2: + fstr = '\\b'+line[:-1]+'\\b' + rstr = line[:-1]+'_OP2CONSTANT' + text = re.sub(fstr,rstr,text) + text = text.replace('#include "const2.inc"','!#include "const2.inc"') + text = text.replace('attributes(host) subroutine','attributes(device) subroutine') + text = text.replace('subroutine '+name, 'subroutine '+name+'_gpu') + text = text.replace('use BCS_KERNELS', '!use BCS_KERNELS') + text = text.replace('use REALGAS_KERNELS', '!use REALGAS_KERNELS') + text = text.replace('use UPDATE_KERNELS', '!use UPDATE_KERNELS') + if ('BCFLUXK' in name) or ('INVISCBNDS' in name): + code('#include "../../bcs_kernels_gpufun.inc"') + kern_names = ['QRG_SET','OUTFLOW_FS','FREESTREAM','INFLOW','MP_INFLOW_CHAR','MP_OUTFLOW_CHAR','OUTFLOW','INFLOW_WHIRL','UNIQUE_INC','FILM_INJ','WFLUX','FFLUX'] + for i 
in range(0,12): + text = text.replace('call '+kern_names[i]+'(', 'call '+kern_names[i]+'_gpu(') + text = text.replace('call '+kern_names[i].lower()+'(', 'call '+kern_names[i].lower()+'_gpu(') + text = text.replace('CALL '+kern_names[i]+'(', 'CALL '+kern_names[i]+'_gpu(') + if 'call LOW' in text: + kern_names = ['LOW','LOWH','LOWK'] + for i in range(0,3): + text = text.replace('call '+kern_names[i]+'(', 'call '+kern_names[i]+'_gpu(') + text = text.replace('CALL '+kern_names[i]+'(', 'CALL '+kern_names[i]+'_gpu(') + code('#include "../../flux_low_gpufun.inc"') + if ('INVJACS' in name): + text = text.replace('call MATINV5(', 'call MATINV5_gpu(') + code('#include "../../update_kernels_gpufun.inc"') + + # + # Apply SoA to variable accesses + # + j = text.find(name+'_gpu') + endj = arg_parse(text,j) + while text[j] != '(': + j = j + 1 + arg_list = text[j+1:endj] + arg_list = arg_list.replace('&','') + varlist = ['']*nargs + leading_dim = [-1]*nargs + for g_m in range(0,nargs): + varlist[g_m] = arg_list.split(',')[g_m].strip() + for g_m in range(0,nargs): + if soaflags[g_m] and not (maps[g_m]==OP_MAP and accs[g_m]==OP_INC): + #Start looking for the variable in the code, after the function signature + loc1 = endj + p = re.compile('\\b'+varlist[g_m]+'\\b') + nmatches = len(p.findall(text[loc1:])) + for id in range(0,nmatches): + #Search for the next occurence + i = p.search(text[loc1:]) + #Skip commented out ones + j = text[:loc1+i.start()].rfind('\n') + if j > -1 and text[j:loc1+i.start()].find('!')>-1: + loc1 = loc1+i.end() + continue + + #Find closing bracket + endarg = arg_parse(text,loc1+i.start()) + #Find opening bracket + beginarg = loc1+i.start() + while text[beginarg] != '(': + beginarg = beginarg+1 + beginarg = beginarg+1 + + #If this is the first time we see the argument (i.e. 
its declaration) + if leading_dim[g_m] == -1: + if (len(text[beginarg:endarg].split(',')) > 1): + #if it's 2D, remember leading dimension, and make it 1D + leading_dim[g_m] = text[beginarg:endarg].split(',')[0] + text = text[:beginarg] + '*'+' '*(endarg-beginarg-1) + text[endarg:] + else: + leading_dim[g_m] = 1 + #Continue search after this instance of the variable + loc1 = endarg+1 + else: + #If we have seen this variable already, then it's in the actual code, replace it with macro + macro = 'OP2_SOA('+text[loc1+i.start():loc1+i.end()]+',' + if leading_dim[g_m] == 1: + macro = macro + text[beginarg:endarg] + else: + macro = macro + text[beginarg:endarg].split(',')[0] + '+('+text[beginarg:endarg].split(',')[1]+'-1)*'+leading_dim[g_m] + if maps[g_m] == OP_MAP: + macro = macro + ', nodes_stride_OP2CONSTANT)' + else: + macro = macro + ', ' + set_name.split('%')[-1].strip()+'_stride_OP2CONSTANT)' + text = text[:loc1+i.start()] + macro + text[endarg+1:] + #Continue search after this instance of the variable + loc1 = loc1+i.start() + len(macro) + + + + + file_text += text + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + fid = open(modfile, 'r') + text = fid.read() + i = text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += 'attributes (host) subroutine ' + name + text[i+ 11 + len(name):j]+'\n\n' + file_text += 'attributes (device) subroutine ' + name + '_gpu' + text[i+ 11 + len(name):j]+'_gpu\n\n' + else: + depth -= 2 + code('attributes (host) &') + code('#include "'+name+'.inc"') + code('attributes (device) &') + code('#include "'+name+'.inc2"') + depth += 2 + code('') + + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE 
op_cuda_'+name+'( &'); depth = depth + 2 + if nopts >0: + code('& optflags, &') + if is_soa > -1: + code('& soa_stride, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + code('& reductionArrayDevice'+str(g_m+1)+', &') + elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opGblDat'+str(g_m+1)+'Device'+name+', &') + + if ninds > 0: #indirect loop + code('& pblkMap, &') + code('& poffset, &') + code('& pnelems, &') + code('& pnthrcol, &') + code('& pthrcol, &') + code('& setSize, &') + code('& blockOffset)') + else: #direct loop + code('& setSize)') + + code('') + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if indaccs[g_m]==OP_READ: + code(typs[invinds[g_m]]+', DEVICE, INTENT(IN) :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + else: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), DEVICE, INTENT(IN) :: opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if accs[g_m] == OP_READ: + code(typs[g_m]+', DEVICE, INTENT(IN) :: 
opDat'+str(g_m+1)+'Device'+name+'(*)') + else: + code(typs[g_m]+', DEVICE :: opDat'+str(g_m+1)+'Device'+name+'(*)') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + #if it's a global reduction, then we pass in a reductionArrayDevice + code(typs[g_m]+', DIMENSION(:), DEVICE :: reductionArrayDevice'+str(g_m+1)) + #and additionally we need registers to store contributions, depending on dim: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opGblDat'+str(g_m+1)+'Device'+name) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opGblDat'+str(g_m+1)+'Device'+name) + else: + #if it's not a global reduction, and multidimensional then we pass in a device array + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + if accs[g_m] == OP_READ: #if OP_READ and dim 1, we can pass in by value + code(typs[g_m]+', VALUE :: opGblDat'+str(g_m+1)+'Device'+name) + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + + if is_soa > -1: + code('INTEGER(kind=4), VALUE :: soa_stride') + + if ninds > 0: #indirect loop + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pblkMap') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: poffset') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnelems') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE, INTENT(IN) :: pthrcol') + code('INTEGER(kind=4), VALUE :: blockOffset') + code('INTEGER(kind=4), VALUE :: setSize') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + + 
code('') + + code('INTEGER(kind=4), SHARED :: numOfColours') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4), SHARED :: blockID') + code('INTEGER(kind=4), SHARED :: threadBlockOffset') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreads') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('INTEGER(kind=4) :: n1') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + + else: #direct loop + code('INTEGER(kind=4), VALUE :: setSize') + code('INTEGER(kind=4) :: i1') + + if nopts > 0: + code('') + comm('optional variables') + #for indirect OP_READ, we would pass in a pointer to shared, offset by map, but if opt, then map may not exist, thus we need a separate pointer + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Opt') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Opt') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('opGblDat'+str(g_m+1)+'Device'+name+' = 0') + + code('') + if ninds > 0: + IF('threadIdx%x - 1 .EQ. 
0') + code('blockID = pblkMap(blockIdx%x - 1 + blockOffset)') + code('numberOfActiveThreads = pnelems(blockID)') + code('numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)') + code('numOfColours = pnthrcol(blockID)') + code('threadBlockOffset = poffset(blockID)') + code('') + ENDIF() + + code('') + code('CALL syncthreads()') + code('') + code('i1 = threadIdx%x - 1') + code('') + + + DOWHILE('i1 < numberOfActiveThreadsCeiling') + if ind_inc or ind_rw: + code('colour2 = -1') + #-----Begin Indirect RW handling----- + if ind_rw: + DO('colour1','0','numOfColours') + IF('i1 < numberOfActiveThreads') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + code('colour2 = pthrcol(i1 + threadBlockOffset)') + IF('colour2 .EQ. colour1') + #-----End Indirect RW handling----- + else: + IF('i1 < numberOfActiveThreads') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Local = 0') + else: + DO('i2','0',dims[g_m]) + code('opDat'+str(g_m+1)+'Local(i2) = 0') + ENDDO() + + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','0', dims[g_m]) + if soaflags[g_m] == 1: + code('opDat'+str(g_m+1)+'Opt(i2) = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' 
& (1 + i2 * soa_stride + map'+str(mapinds[g_m]+1)+'idx)') + else: + code('opDat'+str(g_m+1)+'Opt(i2) = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'))') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Opt = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)') + ENDIF() + + code('') + comm('kernel call') + + else: + DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x','setSize','blockDim%x * gridDim%x') + code('') + comm('kernel call') + code('') + +########################################################################## +# CUDA kernel call +########################################################################## + if ninds > 0: #indirect kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC) and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + (i1 + threadBlockOffset))' + elif maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i1 + threadBlockOffset) * ('+dims[g_m]+') +1' + \ + ':(i1 + threadBlockOffset) * ('+dims[g_m]+') + ('+dims[g_m]+'))' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i1 + threadBlockOffset) * ('+dims[g_m]+') +1)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'):'+ \ + ' 
map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==1: + line = line +indent + '& opDat'+str(g_m+1)+'Opt' + elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('') + #write optional/SoA arguments back from registers + for g_m in range(0,nargs): + if (accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Opt(i2)') + ENDDO() + else: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(g_m+1)+'Opt') + ENDIF() + + if ind_inc and not ind_rw: + code('colour2 = pthrcol(i1 + threadBlockOffset)') + if not ind_rw: + ENDIF() + + if ind_inc or ind_rw: + if ind_inc and not ind_rw: + DO('colour1','0','numOfColours') + IF('colour2 .EQ. 
colour1') + for g_m in range(0,nargs): + if optflags[g_m]==1 and maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local') + else: + if soaflags[g_m] == 1: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*soa_stride + map'+str(mapinds[g_m]+1)+'idx) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*soa_stride + map'+str(mapinds[g_m]+1)+'idx) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx* ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + if optflags[g_m]!=1: + code('') + if optflags[g_m]==1 and maps[g_m]==OP_MAP and (accs[g_m] == OP_INC): + ENDIF() + code('') + ENDIF() + if ind_rw: + ENDIF() + code('CALL syncthreads()') + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + code('') + + else: #direct kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC) and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i1)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + line = line + indent +'& 
opGblDat'+str(g_m+1)+'Device'+name + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 * ('+dims[g_m]+') + 1: i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + + #call cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if 'real' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionFloat8(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0)') + else: + code('CALL ReductionFloat8Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0,'+dims[g_m]+')') + elif 'integer' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionInt4(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0)') + else: + code('CALL ReductionInt4Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0,'+dims[g_m]+')') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CPU hust stub +########################################################################## + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , 
INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + IF('getHybridGPU()') + code('CALL '+name+'_host_gpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ELSE() + code('CALL '+name+'_host_cpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ENDIF() + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for GPU execution') + code('') + code('attributes (host) SUBROUTINE '+name+'_host_gpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(invinds[g_m]+1)+'Device'+name) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: opMap'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(g_m+1)+'Device'+name) + code('') + + for g_m in range(0,ninds): + 
code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: mappingArray'+str(invinds[g_m]+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: pblkMapSize') + code('INTEGER(kind=4) :: poffsetSize') + code('INTEGER(kind=4) :: pnelemsSize') + code('INTEGER(kind=4) :: pnthrcolSize') + code('INTEGER(kind=4) :: pthrcolSize') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pblkMap') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: poffset') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pnelems') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pthrcol') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + 
code('INTEGER(kind=4), SAVE :: calledTimes') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), SAVE :: calledTimes') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + + code('INTEGER(kind=4) :: istat') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code(typs[g_m]+', DIMENSION(:), POINTER :: opDat'+str(g_m+1)+'Host') + else: + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = 
getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + else: + code('') + code('blocksPerGrid = 200') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opMap'+str(invinds[g_m]+1)+'Cardinality = set%setPtr%size * getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + code('') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data_d,opMap'+str(invinds[g_m]+1)+'Device'+name+',(/opMap'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m])>1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host,(/opDat'+str(g_m+1)+'Cardinality/))') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') 
+ code('') + + if ninds > 0: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk,(/set%setPtr%size/))') + code('pblkMapSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap_d,pblkMap,(/pblkMapSize/))') + code('poffsetSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%offset_d,poffset,(/poffsetSize/))') + code('pnelemsSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems_d,pnelems,(/pnelemsSize/))') + code('pnthrcolSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,pnthrcol,(/pnthrcolSize/))') + code('pthrcolSize = set%setPtr%size') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,pthrcol,(/pthrcolSize/))') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + IF('.not. allocated(opGblDat'+str(g_m+1)+'Device'+name+')') + code('allocate(opGblDat'+str(g_m+1)+'Device'+name+'(opArg'+str(g_m+1)+'%dim))') + ENDIF() + code('opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim) = opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim)') + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + IF ('.not. 
allocated(reductionArrayDevice'+str(g_m+1)+name+')') + code('allocate( reductionArrayDevice'+str(g_m+1)+name+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + ENDIF() + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.0') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = 0.0') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+name+' = reductionArrayHost'+str(g_m+1)+'') + + code('') + + #indirect loop host stub call + if ninds > 0: + code('blockOffset = 0') + code('') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + + DO('i2','0','actualPlan_'+name+'%ncolors') + IF('i2 .EQ. actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('blocksPerGrid = ncolblk(i2 + 1)') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + code('CALL op_cuda_'+name+' <<>> (&') + if nopts>0: + code('& optflags, &') + if is_soa > -1: + code('& getSetSizeFromOpArg(opArg'+str(is_soa+1)+'), &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opMap'+str(invinds[inds[g_m]-1]+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('reductionArrayDevice'+str(g_m+1)+name+', &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + code('& pblkMap, &') + code('& 
poffset,pnelems,pnthrcol,pthrcol,set%setPtr%size+set%setPtr%exec_size, blockOffset)') + code('') + code('blockOffset = blockOffset + blocksPerGrid') + ENDDO() + code('') + else: #direct loop host stub call + code('CALL op_cuda_'+name+' <<>>( &') + if nopts>0: + code('& optflags, &') + if is_soa > -1: + code('& getSetSizeFromOpArg(opArg'+str(is_soa+1)+'), &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('reductionArrayDevice'+str(g_m+1)+name+', &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + code('set%setPtr%size)') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. set%setPtr%core_size)') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('') + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim) = opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim)') + + if reduct: + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+name+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = opDat'+str(g_m+1)+'Host + reductionArrayHost'+str(g_m+1)+'(i10+1)') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') + reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+'))') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') +# code('deallocate( 
reductionArrayDevice'+str(g_m+1)+' )') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + + code('istat = cudaDeviceSynchronize()') + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for CPU execution') + code('') +########################################################################## +# Generate OpenMP host stub +########################################################################## 
+########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + 
line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host_cpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if 
maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + + if ninds > 0: #if indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 
0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + if ninds > 0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + code_pre('#endif') + + code('') + code_pre('#ifdef _OPENMP') + code_pre(' numberOfThreads = omp_get_max_threads()') + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * 
getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i1','1','numberOfThreads+1') + DO('i2','1',dims[g_m]+'+1') + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = 0') + ENDDO() + ENDDO() + + code('') + + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + line = '' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + line = line + ', opDat'+str(g_m+1)+'OptPtr' + code('!$OMP PARALLEL DO private (threadID, blockID, nelem, offset_b'+line+')') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('blockID = blkmap_'+name+'(i2+blockOffset+1)') + code('nelem = nelems_'+name+'(blockID+1)') + code('offset_b = offset_'+name+'(blockID+1)') + + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& offset_b, offset_b+nelem)') + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + nblocks') + ENDDO() + else: + code('!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + comm('kernel call') + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, 
&') + code('& sliceStart, sliceEnd)') + ENDDO() + code('!$OMP END PARALLEL DO') + + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i1','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','1',dims[g_m]+'+1') + code('opDat'+str(g_m+1)+'Local(i2) = opDat'+str(g_m+1)+'Local(i2) + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2)') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 
getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_kernel.CUF','w') + elif bookleaf: + fid = open(name+'_gpukernel.CUF','w') + else: + fid = open(name+'_kernel.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py\n!\n\n') + fid.write(file_text) + fid.close() + +########################################################################## +# Assemble Hydra master file +########################################################################## +def op2_gen_cuda_hydra(): + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + file_text = '' + code('MODULE HYDRA_CUDA_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + comm('Constant declarations') + code('#include "hydra_constants.inc"') + code('') + comm('Loop-specific global variables') + file_text += header_text + + code('') + code('CONTAINS') + code('') + code('#include "hydra_constants_set.inc"') + code('#include "flux_low_gpufun.inc"') + code('#include "bcs_kernels_gpufun.inc"') + code('#include "update_kernels_gpufun.inc"') + + file_text += body_text + code('END MODULE') + fid = open('hydra_kernels.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/fortran/op2_gen_cudaINC.py b/translator/fortran/op2_gen_cudaINC.py new file mode 100644 index 000000000..6038c2e47 --- /dev/null +++ b/translator/fortran/op2_gen_cudaINC.py @@ -0,0 +1,1969 @@ +########################################################################## +# +# CUDA code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_kernel.CUF for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYPS',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += prefix+rep(text,g_m)+'\n' + +def code_pre(text): + global 
file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def DO_STEP(i,start,finish,step): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1, '+step) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+' = '+i+' + '+step+' ){') + depth += 2 + +def DOWHILE(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO WHILE ('+line+' )') + elif CPP: + code('while ('+ line+ ' )') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + +def arg_parse(text,j): + + depth = 0 + loc2 = j; + while 1: + if text[loc2] == '(': + depth = depth + 1 + + elif text[loc2] == ')': + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +def op2_gen_cudaINC(master, 
date, consts, kernels, hydra): + + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + header_text = '' + body_text = '' + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + + +# +# set two logicals +# + j = -1 + ind_rw = 0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + if maps[i] == OP_MAP and accs[i] == OP_RW: + ind_rw = 1 + ind_inc = j >= 0 + + j = -1 + reduct_mdim = 0 + reduct_1dim = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and (accs[i] == OP_INC or accs[i] == OP_MAX or accs[i] == OP_MIN): + j = i + if (not dims[i].isdigit()) or int(dims[i])>1: + reduct_mdim = 1 + if (accs[i] == OP_MAX or accs[i] == OP_MIN): + print('ERROR: Multidimensional MIN/MAX reduction not yet implemented') + else: + reduct_1dim = 1 + if maps[i] == OP_GBL and accs[i] == OP_WRITE: + j = i + reduct = j >= 0 + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + ninds_staged = 0 + inds_staged = [-1]*nargs + for i in range(0,nargs): + if maps[i]==OP_MAP and accs[i]==OP_INC: + if inds_staged[invinds[inds[i]-1]] == -1: + inds_staged[i] = ninds_staged + ninds_staged = ninds_staged + 1 + else: + inds_staged[i] = inds_staged[invinds[inds[i]-1]] + invinds_staged = [-1]*ninds_staged + inddims_staged = [-1]*ninds_staged + indopts_staged = [-1]*ninds_staged + for i in range(0,nargs): + if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: + invinds_staged[inds_staged[i]] = i + inddims_staged[inds_staged[i]] = dims[i] + if optflags[i] == 1: + indopts_staged[inds_staged[i]] = i + for i in range(0,nargs): + inds_staged[i] = inds_staged[i] + 1 + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + modfile = kernels[nk]['mod_file'][4:] + filename = 
modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + code('USE HYDRA_CUDA_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if (accs[g_m]== OP_INC or accs[g_m]== OP_MIN or accs[g_m]== OP_MAX): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)+name) + if ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opGblDat'+str(g_m+1)+'Device'+name) + + + code('') + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + code('') + if is_soa > -1: + code('#include "op2_macros.h"') + code('') + code('CONTAINS') + code('') + +########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct_1dim: + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8) :: 
inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + code('attributes (device) SUBROUTINE ReductionInt4(reductionResult,inputValue,reductionOperation)') + code('INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult') + code('INTEGER(kind=4) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: sharedInt4') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedInt4(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1)') + code('CASE (1)') + IF('sharedInt4(threadID + i1) < sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(threadID + i1) > sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedInt4(0)') + code('CASE (1)') + IF('sharedInt4(0) < reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(0) > reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + if reduct_mdim: + comm('Multidimensional reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8Mdim(reductionResult,inputValue,reductionOperation,dim)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(:) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), VALUE :: dim') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: d') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2)') + ENDDO() + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1)') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + + +########################################################################## +# Inline user kernel function +########################################################################## + if hydra: + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','attributes(host) subroutine') + text = text.replace('subroutine '+name, 'subroutine '+name) + file_text += text + code('') + code('') + i = text.find('const2.inc') + if i > -1: + fi2 = open("hydra_constants_list.txt","r") + for line in fi2: + fstr = '\\b'+line[:-1]+'\\b' + rstr = line[:-1]+'_OP2CONSTANT' + text = re.sub(fstr,rstr,text) + text = text.replace('#include "const2.inc"','!#include "const2.inc"') + text = text.replace('attributes(host) subroutine','attributes(device) subroutine') + text = text.replace('subroutine '+name, 'subroutine '+name+'_gpu') + text = text.replace('use BCS_KERNELS', '!use BCS_KERNELS') + text = text.replace('use REALGAS_KERNELS', '!use REALGAS_KERNELS') + text = text.replace('use UPDATE_KERNELS', '!use UPDATE_KERNELS') + if ('BCFLUXK' in name) or ('INVISCBNDS' in name): + code('#include "../../bcs_kernels_gpufun.inc"') + kern_names = ['QRG_SET','OUTFLOW_FS','FREESTREAM','INFLOW','MP_INFLOW_CHAR','MP_OUTFLOW_CHAR','OUTFLOW','INFLOW_WHIRL','UNIQUE_INC','FILM_INJ','WFLUX','FFLUX'] + for i in range(0,12): + text = text.replace('call '+kern_names[i]+'(', 'call '+kern_names[i]+'_gpu(') + text = text.replace('call '+kern_names[i].lower()+'(', 'call '+kern_names[i].lower()+'_gpu(') + text = text.replace('CALL '+kern_names[i]+'(', 'CALL '+kern_names[i]+'_gpu(') + if 'call LOW' in text: + kern_names = ['LOW','LOWH','LOWK'] + for i in 
range(0,3): + text = text.replace('call '+kern_names[i]+'(', 'call '+kern_names[i]+'_gpu(') + text = text.replace('CALL '+kern_names[i]+'(', 'CALL '+kern_names[i]+'_gpu(') + code('#include "../../flux_low_gpufun.inc"') + if ('INVJACS' in name): + text = text.replace('call MATINV5(', 'call MATINV5_gpu(') + code('#include "../../update_kernels_gpufun.inc"') + + j = text.find(name+'_gpu') + endj = arg_parse(text,j) + while text[j] != '(': + j = j + 1 + arg_list = text[j+1:endj] + arg_list = arg_list.replace('&','') + varlist = ['']*nargs + leading_dim = [-1]*nargs + for g_m in range(0,nargs): + varlist[g_m] = arg_list.split(',')[g_m].strip() + for g_m in range(0,nargs): + if soaflags[g_m] and not (maps[g_m]==OP_MAP and accs[g_m]==OP_INC): + #Start looking for the variable in the code, after the function signature + loc1 = endj + p = re.compile('\\b'+varlist[g_m]+'\\b') + nmatches = len(p.findall(text[loc1:])) + for id in range(0,nmatches): + #Search for the next occurence + i = p.search(text[loc1:]) + #Skip commented out ones + j = text[:loc1+i.start()].rfind('\n') + if j > -1 and text[j:loc1+i.start()].find('!')>-1: + loc1 = loc1+i.end() + continue + + #Find closing bracket + endarg = arg_parse(text,loc1+i.start()) + #Find opening bracket + beginarg = loc1+i.start() + while text[beginarg] != '(': + beginarg = beginarg+1 + beginarg = beginarg+1 + + #If this is the first time we see the argument (i.e. 
its declaration) + if leading_dim[g_m] == -1: + if (len(text[beginarg:endarg].split(',')) > 1): + #if it's 2D, remember leading dimension, and make it 1D + leading_dim[g_m] = text[beginarg:endarg].split(',')[0] + text = text[:beginarg] + '*'+' '*(endarg-beginarg-1) + text[endarg:] + else: + leading_dim[g_m] = 1 + #Continue search after this instance of the variable + loc1 = endarg+1 + else: + #If we have seen this variable already, then it's in the actual code, replace it with macro + macro = 'OP2_SOA('+text[loc1+i.start():loc1+i.end()]+',' + if leading_dim[g_m] == 1: + macro = macro + text[beginarg:endarg] + else: + macro = macro + text[beginarg:endarg].split(',')[0] + '+('+text[beginarg:endarg].split(',')[1]+'-1)*'+leading_dim[g_m] + if maps[g_m] == OP_MAP: + macro = macro + ', nodes_stride_OP2CONSTANT)' + else: + macro = macro + ', ' + set_name.split('%')[-1].strip()+'_stride_OP2CONSTANT)' + text = text[:loc1+i.start()] + macro + text[endarg+1:] + #Continue search after this instance of the variable + loc1 = loc1+i.start() + len(macro) + + + + + file_text += text + + else: + depth -= 2 + code('attributes (host) &') + code('#include "'+name+'.inc"') + code('attributes (device) &') + code('#include "'+name+'.inc2"') + depth += 2 + code('') + + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE op_cuda_'+name+'( &'); depth = depth + 2 + if nopts >0: + code('& optflags, &') + if is_soa > -1: + code('& soa_stride, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& 
opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + code('& reductionArrayDevice'+str(g_m+1)+', &') + elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opGblDat'+str(g_m+1)+'Device'+name+', &') + + if ninds > 0: #indirect loop + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + + code('& ind_sizes, &') + code('& ind_offs, &') + code('& pblkMap, &') + code('& poffset, &') + code('& pnelems, &') + code('& pnthrcol, &') + code('& pthrcol, &') + code('& setSize, &') + code('& blockOffset)') + else: #direct loop + code('& setSize)') + + code('') + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if indaccs[g_m]==OP_READ: + code(typs[invinds[g_m]]+', DEVICE, INTENT(IN) :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + else: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), DEVICE :: opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if accs[g_m] == OP_READ: + code(typs[g_m]+', DEVICE, INTENT(IN) :: opDat'+str(g_m+1)+'Device'+name+'(*)') + else: + code(typs[g_m]+', DEVICE :: opDat'+str(g_m+1)+'Device'+name+'(*)') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + #if it's a global 
reduction, then we pass in a reductionArrayDevice + code(typs[g_m]+', DIMENSION(:), DEVICE :: reductionArrayDevice'+str(g_m+1)) + #and additionally we need registers to store contributions, depending on dim: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opGblDat'+str(g_m+1)+'Device'+name) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opGblDat'+str(g_m+1)+'Device'+name) + else: + #if it's not a global reduction, and multidimensional then we pass in a device array + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + if accs[g_m] == OP_READ: #if OP_READ and dim 1, we can pass in by value + code(typs[g_m]+', VALUE :: opGblDat'+str(g_m+1)+'Device'+name) + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + + if is_soa > -1: + code('INTEGER(kind=4), VALUE :: soa_stride') + + if ninds > 0: #indirect loop + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_maps'+str(invinds_staged[g_m]+1)) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), DIMENSION(0:*), DEVICE :: mappingArray'+str(g_m+1)) + code('') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_sizes') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_offs') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pblkMap') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: poffset') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnelems') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pthrcol') + code('INTEGER(kind=4), VALUE :: blockOffset') + code('INTEGER(kind=4), VALUE :: setSize') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if inds_staged[g_m] > 0: + code('INTEGER(kind=4) :: 
opDat'+str(g_m+1)+'SharedMap') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + + code('') + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedFloat8') + if add_int: + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: sharedInt8') + code('') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), SHARED :: ind_maps'+str(invinds_staged[g_m]+1)+'offset') + code('INTEGER(kind=4), SHARED :: ind_maps'+str(invinds_staged[g_m]+1)+'size') + + code('INTEGER(kind=4), SHARED :: numOfColours') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4), SHARED :: blockID') + code('INTEGER(kind=4), SHARED :: threadBlockOffset') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreads') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('INTEGER(kind=4) :: n1') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4) :: opDat'+str(invinds_staged[g_m]+1)+'nBytes') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), SHARED :: opDat'+str(invinds_staged[g_m]+1)+'RoundUp') + if ninds_staged > 0: + code('INTEGER(kind=4) moduloResult') + + else: #direct loop + code('INTEGER(kind=4), VALUE :: setSize') + code('INTEGER(kind=4) :: i1') + + if nopts > 0: + code('') + comm('optional variables') + #for indirect OP_READ, we would pass in a pointer to shared, offset by map, but if opt, then map may not exist, thus we need a separate pointer + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + if dims[g_m].isdigit() and 
int(dims[g_m])==1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Opt') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Opt') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('opGblDat'+str(g_m+1)+'Device'+name+' = 0') + + code('') + if ninds > 0: + IF('threadIdx%x - 1 .EQ. 0') + code('blockID = pblkMap(blockIdx%x - 1 + blockOffset)') + code('numberOfActiveThreads = pnelems(blockID)') + code('numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)') + code('numOfColours = pnthrcol(blockID)') + code('threadBlockOffset = poffset(blockID)') + for g_m in range(0,ninds_staged): + code('ind_maps'+str(invinds_staged[g_m]+1)+'offset = ind_offs ('+str(g_m)+' + blockID * '+str(ninds_staged)+')') + code('ind_maps'+str(invinds_staged[g_m]+1)+'size = ind_sizes('+str(g_m)+' + blockID * '+str(ninds_staged)+')') + for g_m in range(0,ninds_staged): + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = ind_maps'+str(invinds_staged[g_m]+1)+'size * ('+inddims_staged[g_m]+')') + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = opDat'+str(invinds_staged[g_m]+1)+'RoundUp + MOD(opDat'+str(invinds_staged[g_m]+1)+'RoundUp,2)') + ENDIF() + + code('') + code('CALL syncthreads()') + code('') + for g_m in range(0,ninds_staged): + if g_m>0 and indopts_staged[g_m-1] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m-1]])+')') + if g_m == 0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + prev_size = 0 + if 'real' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 8 + elif 'integer' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 4 + this_size = 0 + if 'real' in typs[invinds_staged[g_m]].lower(): + this_size = 8 + elif 'integer' in typs[invinds_staged[g_m]].lower(): + this_size = 4 + if this_size == 0 or prev_size == 0: + print("ERROR: Unrecognized type") + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * 
'+str(prev_size)+\ + ' / '+str(this_size)+' + opDat'+str(invinds_staged[g_m-1]+1)+'RoundUp * '+str(prev_size)+' / '+str(this_size)) + if g_m>0 and indopts_staged[g_m-1] > 0: + ELSE() + if g_m==0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)) + ENDIF() + + code('') + for g_m in range(0,ninds_staged): + code('') + code('i1 = threadIdx%x - 1') + if indopts_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m]])+')') + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size') + DO('i2','0', inddims_staged[g_m]) + if accs[invinds_staged[g_m]] == OP_READ or accs[invinds_staged[g_m]] == OP_RW or accs[invinds_staged[g_m]] == OP_WRITE: + if soaflags[invinds_staged[g_m]] == 1: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * soa_stride + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1))') + else: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1) * ('+inddims_staged[g_m]+'))') + elif accs[invinds_staged[g_m]] == OP_INC: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = 0') + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + if indopts_staged[g_m] > 0: + ENDIF() + code('') + code('') + code('CALL syncthreads()') + code('i1 = threadIdx%x - 1') + code('') + + + DOWHILE('i1 < numberOfActiveThreadsCeiling') + if ind_inc or ind_rw: + code('colour2 = -1') + #-----Begin Indirect RW handling----- + if ind_rw: + DO('colour1','0','numOfColours') + IF('i1 < numberOfActiveThreads') + k = 
[] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + code('colour2 = pthrcol(i1 + threadBlockOffset)') + IF('colour2 .EQ. colour1') + #-----End Indirect RW handling----- + else: + IF('i1 < numberOfActiveThreads') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Local = 0') + else: + DO('i2','0',dims[g_m]) + code('opDat'+str(g_m+1)+'Local(i2) = 0') + ENDDO() + + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','0', dims[g_m]) + if soaflags[g_m] == 1: + code('opDat'+str(g_m+1)+'Opt(i2) = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 * soa_stride + map'+str(mapinds[g_m]+1)+'idx)') + else: + code('opDat'+str(g_m+1)+'Opt(i2) = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'))') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Opt = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)') + ENDIF() + + code('') + comm('kernel call') + + else: + DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x','setSize','blockDim%x * gridDim%x') + code('') + comm('kernel call') + code('') + 
+########################################################################## +# CUDA kernel call +########################################################################## + if ninds > 0: #indirect kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC) and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + (i1 + threadBlockOffset))' + elif maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i1 + threadBlockOffset) * ('+dims[g_m]+') +1' + \ + ':(i1 + threadBlockOffset) * ('+dims[g_m]+') + ('+dims[g_m]+'))' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i1 + threadBlockOffset) * ('+dims[g_m]+') +1)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'):'+ \ + ' map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==1: + line = line +indent + '& opDat'+str(g_m+1)+'Opt' + elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + elif maps[g_m] 
== OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('') + #write optional/SoA arguments back from registers + for g_m in range(0,nargs): + if (accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Opt(i2)') + ENDDO() + else: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(g_m+1)+'Opt') + ENDIF() + + if ind_inc and not ind_rw: + code('colour2 = pthrcol(i1 + threadBlockOffset)') + if not ind_rw: + ENDIF() + + if ind_inc or ind_rw: + if ind_inc and not ind_rw: + code('') + if ninds_staged > 0: + IF('colour2 .GE. 0') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0 and accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'SharedMap = mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset)') + if ninds_staged > 0: + ENDIF() + code('') + DO('colour1','0','numOfColours') + IF('colour2 .EQ. 
colour1') + for g_m in range(0,nargs): + if optflags[g_m]==1 and maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if inds_staged[g_m] == 0: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local') + else: + if soaflags[g_m] == 1: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*soa_stride + map'+str(mapinds[g_m]+1)+'idx) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*soa_stride + map'+str(mapinds[g_m]+1)+'idx) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx* ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + (i2 + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) = &') + code('& sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + (i2 + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + if optflags[g_m]!=1: + code('') + if optflags[g_m]==1 and maps[g_m]==OP_MAP and (accs[g_m] == OP_INC): + ENDIF() + code('') + ENDIF() + if ind_rw: + ENDIF() + code('CALL syncthreads()') + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + code('') + for g_m in range(0,ninds_staged): + if accs[invinds_staged[g_m]] == OP_INC: + if indopts_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m-1]])+')') + code('i1 = threadIdx%x - 1') 
+ if soaflags[invinds_staged[g_m]] == 1: + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size') + DO('i2','0', inddims_staged[g_m]) + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * soa_stride + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * soa_stride + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) + &') + code('& sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + '))') + ENDDO() + else: + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size * ('+inddims_staged[g_m]+')') + code('moduloResult = mod(i1,'+inddims_staged[g_m]+')') + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds_staged[g_m]+1)+' &') + code('& (ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1 / ('+inddims_staged[g_m]+')) * ('+inddims_staged[g_m]+') + 1) = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds_staged[g_m]+1)+' &') + code('& (ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1 / ('+inddims_staged[g_m]+')) * ('+inddims_staged[g_m]+') + 1) + &') + code('& sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i1)') + code('i1 = i1 + blockDim%x') + ENDDO() + if indopts_staged[g_m] > 0: + ENDIF() + else: #direct kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC) and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i1)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + line = line + indent +'& 
opGblDat'+str(g_m+1)+'Device'+name + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 * ('+dims[g_m]+') + 1: i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + + #call cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if 'real' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionFloat8(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0)') + else: + code('CALL ReductionFloat8Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0,'+dims[g_m]+')') + elif 'integer' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionInt4(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0)') + else: + code('CALL ReductionInt4Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:),opGblDat'+str(g_m+1)+'Device'+name+',0,'+dims[g_m]+')') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CPU hust stub +########################################################################## + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , 
INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + IF('getHybridGPU()') + code('CALL '+name+'_host_gpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ELSE() + code('CALL '+name+'_host_cpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ENDIF() + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for GPU execution') + code('') + code('attributes (host) SUBROUTINE '+name+'_host_gpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opDat'+str(invinds[g_m]+1)+'Device'+name) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: opMap'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opDat'+str(g_m+1)+'Device'+name) + code('') + + for g_m in range(0,ninds): + 
code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('TYPE ( c_devptr ), POINTER, DIMENSION(:) :: mappingArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: mappingArray'+str(invinds[g_m]+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), DEVICE, ALLOCATABLE, DIMENSION(:) :: ind_maps'+str(invinds_staged[g_m]+1)) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), DEVICE, ALLOCATABLE, DIMENSION(:) :: mappingArray'+str(g_m+1)) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: ind_offs') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: ind_sizes') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pblkMap') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: poffset') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pnelems') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, 
ALLOCATABLE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pthrcol') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: pnindirect') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nsharedCol') + code('TYPE ( c_devptr ), POINTER, DIMENSION(:) :: ind_maps') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4), SAVE :: calledTimes') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), SAVE :: calledTimes') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + + code('INTEGER(kind=4) :: istat') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code(typs[g_m]+', DIMENSION(:), POINTER :: opDat'+str(g_m+1)+'Host') + else: + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , 
userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray, 1)') + code('') + else: + code('') + code('blocksPerGrid = 200') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opMap'+str(invinds[g_m]+1)+'Cardinality = set%setPtr%size * getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + code('') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data_d,opMap'+str(invinds[g_m]+1)+'Device'+name+',(/opMap'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == 
OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m])>1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host,(/opDat'+str(g_m+1)+'Cardinality/))') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') + code('') + + if ninds > 0: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk,(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nsharedCol,nsharedCol,(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nindirect,pnindirect,(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap_d,pblkMap,(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset_d,poffset,(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems_d,pnelems,(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,pnthrcol,(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,pthrcol,(/set%setPtr%size/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_maps,ind_maps,(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%maps,mappingArray,(/numberOfOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_sizes,ind_sizes,(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_offs,ind_offs,(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('') + for g_m in range(0,ninds_staged): + code('CALL c_f_pointer(ind_maps('+str(g_m+1)+'),ind_maps'+str(invinds_staged[g_m]+1)+',(/pnindirect('+str(g_m+1)+')/))') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + 
code('CALL c_f_pointer(mappingArray('+str(g_m+1)+'),mappingArray'+str(g_m+1)+',(/set%setPtr%size/))') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + IF('.not. allocated(opGblDat'+str(g_m+1)+'Device'+name+')') + code('allocate(opGblDat'+str(g_m+1)+'Device'+name+'(opArg'+str(g_m+1)+'%dim))') + ENDIF() + code('opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim) = opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim)') + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + IF ('.not. allocated(reductionArrayDevice'+str(g_m+1)+name+')') + code('allocate( reductionArrayDevice'+str(g_m+1)+name+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + ENDIF() + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.0') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = 0.0') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+name+' = reductionArrayHost'+str(g_m+1)+'') + + code('') + + #indirect loop host stub call + if ninds > 0: + code('blockOffset = 0') + code('') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + + DO('i2','0','actualPlan_'+name+'%ncolors') + IF('i2 .EQ. 
actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('blocksPerGrid = ncolblk(i2 + 1)') + code('dynamicSharedMemorySize = MAX(reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock, nsharedCol(1+i2))') + code('') + code('CALL op_cuda_'+name+' <<>> (&') + if nopts>0: + code('& optflags, &') + if is_soa > -1: + code('& getSetSizeFromOpArg(opArg'+str(is_soa+1)+'), &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opMap'+str(invinds[inds[g_m]-1]+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('reductionArrayDevice'+str(g_m+1)+name+', &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + + code('& ind_sizes, &') + code('& ind_offs, &') + code('& pblkMap, &') + code('& poffset,pnelems,pnthrcol,pthrcol,set%setPtr%size+set%setPtr%exec_size, blockOffset)') + code('') + code('blockOffset = blockOffset + blocksPerGrid') + ENDDO() + code('') + else: #direct loop host stub call + code('CALL op_cuda_'+name+' <<>>( &') + if nopts>0: + code('& optflags, &') + if is_soa > -1: + code('& getSetSizeFromOpArg(opArg'+str(is_soa+1)+'), &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or 
accs[g_m] == OP_MAX): + code('reductionArrayDevice'+str(g_m+1)+name+', &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + code('set%setPtr%size)') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. set%setPtr%core_size)') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim) = opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim)') + + if reduct: + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+name+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = opDat'+str(g_m+1)+'Host + reductionArrayHost'+str(g_m+1)+'(i10+1)') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') + reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+'))') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') +# code('deallocate( reductionArrayDevice'+str(g_m+1)+' )') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + 
elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('istat = cudaDeviceSynchronize()') + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for CPU execution') + code('') +########################################################################## +# Generate OpenMP host stub +########################################################################## +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + 
[mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + 
+########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host_cpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + + if 
ninds > 0: #if indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + if ninds > 0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + code_pre('#endif') + + code('') + code_pre('#ifdef _OPENMP') + code_pre(' numberOfThreads = omp_get_max_threads()') + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if 
ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL 
c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i1','1','numberOfThreads+1') + DO('i2','1',dims[g_m]+'+1') + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = 0') + ENDDO() + ENDDO() + + code('') + + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + line = '' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + line = line + ', opDat'+str(g_m+1)+'OptPtr' + code('!$OMP PARALLEL DO private (threadID, blockID, nelem, offset_b'+line+')') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('blockID = blkmap_'+name+'(i2+blockOffset+1)') + code('nelem = nelems_'+name+'(blockID+1)') + code('offset_b = offset_'+name+'(blockID+1)') + + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& offset_b, offset_b+nelem)') + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + 
nblocks') + ENDDO() + else: + code('!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + comm('kernel call') + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + code('& sliceStart, sliceEnd)') + ENDDO() + code('!$OMP END PARALLEL DO') + + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i1','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','1',dims[g_m]+'+1') + code('opDat'+str(g_m+1)+'Local(i2) = opDat'+str(g_m+1)+'Local(i2) + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2)') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL 
op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_kernel.CUF','w') + else: + fid = open(name+'_kernel.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
def op2_gen_cuda_hydra():
    """Assemble the Hydra master CUDA Fortran module and write it to disk.

    The module text is built in the global ``file_text`` buffer through the
    ``code``/``comm`` emitters: a fixed ``MODULE HYDRA_CUDA_MODULE`` preamble,
    the accumulated ``header_text`` (placed before ``CONTAINS``), a set of
    ``#include`` lines, the accumulated ``body_text`` (placed after
    ``CONTAINS``), and the closing ``END MODULE``.

    The result is written to ``hydra_kernels.CUF`` in the current working
    directory, preceded by an "auto-generated" banner carrying the current
    date and time. Any existing file of that name is overwritten.
    """
    # Only file_text is rebound here; header_text/body_text are read-only
    # inputs expected to have been filled in before this function is called.
    global file_text, header_text, body_text

    file_text = ''
    code('MODULE HYDRA_CUDA_MODULE')
    code('USE OP2_FORTRAN_DECLARATIONS')
    code('USE OP2_FORTRAN_RT_SUPPORT')
    code('USE ISO_C_BINDING')
    code('USE CUDAFOR')
    code('USE CUDACONFIGURATIONPARAMS')
    code('')
    code('')
    comm('Constant declarations')
    code('#include "hydra_constants.inc"')
    code('')
    comm('Loop-specific global variables')
    file_text += header_text

    code('')
    code('CONTAINS')
    code('')
    code('#include "hydra_constants_set.inc"')
    code('#include "flux_low_gpufun.inc"')
    code('#include "bcs_kernels_gpufun.inc"')
    code('#include "update_kernels_gpufun.inc"')

    file_text += body_text
    code('END MODULE')

    date = datetime.datetime.now()
    # The context manager guarantees the handle is closed even if a write fails.
    with open('hydra_kernels.CUF', 'w') as fid:
        fid.write('!\n! auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n')
        fid.write(file_text)
def comm(line):
    """Append a comment line to the generated source held in ``file_text``.

    An empty ``line`` emits a bare newline. In FORTRAN mode the comment is
    written as ``! <line>`` starting in column 0 (the current ``depth`` is not
    applied); in CPP mode it is written as ``//<line>`` indented by ``depth``
    spaces. Nothing is emitted when neither mode flag is set.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
    elif FORTRAN:
        file_text += '! '+line+'\n'
    elif CPP:
        file_text += ' '*depth+'//'+line+'\n'


def rep(line, m):
    """Return ``line`` with the placeholder tokens expanded for argument ``m``.

    FORTRAN mode (1-based argument names):
      ``INDDIM``/``INDTYP`` -> ``inddims[m]``/``indtyps[m]`` (only when ``m``
      indexes an existing indirect dataset), ``INDARG`` -> ``ind_arg<m+1>``,
      ``DIMS`` -> ``dims[m]``, ``ARG`` -> ``arg<m+1>``, ``TYPS`` -> ``typs[m]``,
      ``IDX`` -> ``int(idxs[m])``.
    CPP mode (0-based argument names):
      ``INDDIM``, ``INDTYP``, ``INDARG`` -> ``ind_arg<m>``, ``DIM``,
      ``ARG`` -> ``arg<m>``, ``TYP``, ``IDX``.

    Substitutions are applied one after another in the order listed, so
    ``INDARG`` is consumed before the shorter ``ARG`` token is looked for.
    When neither mode flag is set the line is returned unchanged.
    """
    if FORTRAN:
        substitutions = []
        if m < len(inddims):
            substitutions.append(('INDDIM', str(inddims[m])))
            substitutions.append(('INDTYP', str(indtyps[m])))
        substitutions.extend([
            ('INDARG', 'ind_arg'+str(m+1)),
            ('DIMS', str(dims[m])),
            ('ARG', 'arg'+str(m+1)),
            ('TYPS', typs[m]),
            ('IDX', str(int(idxs[m]))),
        ])
    elif CPP:
        substitutions = [
            ('INDDIM', str(inddims[m])),
            ('INDTYP', str(indtyps[m])),
            ('INDARG', 'ind_arg'+str(m)),
            ('DIM', str(dims[m])),
            ('ARG', 'arg'+str(m)),
            ('TYP', typs[m]),
            ('IDX', str(int(idxs[m]))),
        ]
    else:
        return line

    # Plain string replacement: the tokens are literals, and the substituted
    # values must be inserted verbatim (a regex replacement would interpret
    # backslashes in them as escapes or group references).
    for token, value in substitutions:
        line = line.replace(token, value)
    return line


def code(text):
    """Append one line of generated code to ``file_text``.

    The text is passed through ``rep`` for the current global argument index
    ``g_m`` and indented by ``depth`` spaces; an empty ``text`` produces a
    newline with no indentation. Nothing is emitted when neither FORTRAN nor
    CPP mode is active.
    """
    global file_text
    prefix = '' if len(text) == 0 else ' '*depth
    if FORTRAN or CPP:
        file_text += prefix+rep(text,g_m)+'\n'


def code_pre(text):
    """Append a line like ``code`` does, but never indent it.

    The callers visible in this file use it for preprocessor directives.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text,g_m)+'\n'


def DO(i,start,finish):
    """Open a counted loop over ``start <= i < finish`` (step 1) and indent."""
    global depth
    if FORTRAN:
        code('DO '+i+' = '+start+', '+finish+' - 1, 1')
    elif CPP:
        code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
    depth += 2


def DO_STEP(i,start,finish,step):
    """Open a counted loop over ``start <= i < finish`` with stride ``step``."""
    global depth
    if FORTRAN:
        code('DO '+i+' = '+start+', '+finish+' - 1, '+step)
    elif CPP:
        code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+' = '+i+' + '+step+' ){')
    depth += 2


def DOWHILE(line):
    """Open a while-loop on condition ``line`` and indent."""
    global depth
    if FORTRAN:
        code('DO WHILE ('+line+' )')
    elif CPP:
        code('while ('+ line+ ' )')
    depth += 2


def FOR(i,start,finish):
    """Open a counted loop like ``DO``; the Fortran form omits the explicit step."""
    global depth
    if FORTRAN:
        code('DO '+i+' = '+start+', '+finish+' - 1')
    elif CPP:
        code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
    depth += 2


def ENDDO():
    """Dedent and close the innermost loop."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def ENDFOR():
    """Dedent and close a loop opened with ``FOR`` (same output as ``ENDDO``)."""
    ENDDO()


def IF(line):
    """Open a conditional on ``line`` and indent."""
    global depth
    if FORTRAN:
        code('IF ('+line+') THEN')
    elif CPP:
        code('if ('+ line + ') {')
    depth += 2


def ELSE():
    """Emit the else-branch separator at the enclosing ``IF``'s indentation."""
    global depth
    depth -= 2
    if FORTRAN:
        code('ELSE')
    elif CPP:
        # Close the brace opened by IF() before starting the else branch;
        # a bare 'else {' would leave the generated braces unbalanced.
        code('} else {')
    depth += 2


def ENDIF():
    """Dedent and close the innermost conditional."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END IF')
    elif CPP:
        code('}')
+find_function_calls=util.find_function_calls + +def op2_gen_cuda_color2(master, date, consts, kernels, hydra, bookleaf): + +# global util.funlist, util.const_list + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + util.funlist = [] + util.const_list = [] + + header_text = '' + body_text = '' + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + hybrid = 0 +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + +# +# set two logicals +# + + j = -1 + reduct_mdim = 0 + reduct_1dim = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and (accs[i] == OP_INC or accs[i] == OP_MAX or accs[i] == OP_MIN): + j = i + if (not dims[i].isdigit()) or int(dims[i])>1: + reduct_mdim = 1 + else: + reduct_1dim = 1 + if maps[i] == OP_GBL and accs[i] == OP_WRITE: + j = i + reduct = reduct_1dim or reduct_mdim + + #npdes->DNPDE + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()) and maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if 'npdes' in dims[g_m]: + dims[g_m] = dims[g_m].replace('npdes','DNPDE') + for g_m in range(0,ninds): + if (not inddims[g_m].isdigit()) and indaccs[g_m] == OP_INC: + if 'npdes' in dims[g_m]: + inddims[g_m] = inddims[g_m].replace('npdes','DNPDE') + +#atomics option + ind_rw=0 + ind_inc=0 + atomics=0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_RW: + ind_rw=1 + if maps[i] == OP_MAP and accs[i] == OP_INC: + ind_inc=1 + if not ind_rw and ind_inc: + atomics = 1 +# atomics=0 + + +# for g_m in range(0,nargs): +# if dims[g_m] == 'NPDE': +# dims[g_m] = '6' + +# if ('GRADL_EDGECON' in name): +# for g_m in range(0,nargs): +# if 'NPDE' in dims[g_m]: +# dims[g_m] = dims[g_m].replace('NPDE','6') +# try: +# newdim = str(eval(dims[g_m])) +# dims[g_m] = newdim +# except NameError as inst: +# dims[g_m] +# #do nothing + + atomic_reduction = 0 + if 'MP_FOURIER_CDCH' in name or 'intclineprim' in name or 'accumintegrals' in name: + atomic_reduction = 1 + unknown_reduction_size = 0 + unknown_size_red_atomic = [0]*nargs + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()): + found=0 + if found==0: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + if atomic_reduction == 1: + 
unknown_reduction_size = 2 + unknown_size_red_atomic[g_m] = 1 + else: + unknown_reduction_size = 1 + soaflags[g_m] = 1 + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + for i in range(0,nargs): + if maps[i]==OP_MAP: + dims[i] = dims[invinds[inds[i]-1]] + + stage_flags = [0]*nargs + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + permute = 0 + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][9:]+'_module_MODULE') + modfile = kernels[nk]['mod_file'][9:]+'_module' + filename = 'kernels/'+kernels[nk]['master_file']+'_'+name+'.inc' + if not os.path.isfile(filename): + files = [f for f in glob.glob('kernels/*'+name+'.inc')] + if len(files)>0: + filename = files[0] + else: + print('kernel for '+name+' not found') + fid = open(filename, 'r') + text = fid.read() + fid.close() + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), CONSTANT :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT') + 
code('INTEGER(kind=4) :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('INTEGER(kind=4), CONSTANT :: direct_stride_OP2CONSTANT') + code('INTEGER(kind=4) :: direct_stride_OP2HOST') + dir_soa = g_m + break + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if (accs[g_m]== OP_INC or accs[g_m]== OP_MIN or accs[g_m]== OP_MAX): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)+name) + if ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opGblDat'+str(g_m+1)+'Device'+name) + + + code('') + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + code('') + if any_soa > -1: + code('#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)') + code('') + code('CONTAINS') + code('') + +########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct_1dim or (unknown_reduction_size==1): + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(sharedDouble8, reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*) :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedDouble8(threadID) = 
sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + code('attributes (device) SUBROUTINE ReductionInt4(sharedInt4, reductionResult,inputValue,reductionOperation)') + code('INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult') + code('INTEGER(kind=4) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedInt4(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1)') + code('CASE (1)') + IF('sharedInt4(threadID + i1) < sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(threadID + i1) > sharedInt4(threadID)') + code('sharedInt4(threadID) = 
sharedInt4(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedInt4(0)') + code('CASE (1)') + IF('sharedInt4(0) < reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(0) > reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + if reduct_mdim: + comm('Multidimensional reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8Mdim(sharedDouble8, reductionResult,inputValue,reductionOperation,dim)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(:) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), VALUE :: dim') + code('REAL(kind=8), DIMENSION(0:*) :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: d') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2)') + ENDDO() + code('CASE (1)') + DO('i2','0','dim') +# IF('sharedDouble8(threadID*dim + i2).GT.sharedDouble8((threadID + i1)*dim + i2)') + code('sharedDouble8(threadID*dim + i2) = MIN(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + #ENDIF() + ENDDO() + code('CASE (2)') + DO('i2','0','dim') + 
code('sharedDouble8(threadID*dim + i2) = MAX(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + ENDDO() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1)') + code('CASE (1)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MIN(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('CASE (2)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MAX(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + + +########################################################################## +# Inline user kernel function +########################################################################## + using_consts = 0 + if hydra: + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + #text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','subroutine') + if hybrid == 1: + text = text.replace('subroutine '+name, 'attributes(host) subroutine '+name) + file_text += text + code('') + code('') + #remove all comments + util.const_list = [] + text = re.sub('!.*\n','\n',text) + text = replace_consts(text) + using_consts = text.find('use HYDRA_CONST_MODULE')>=0 + text = text.replace('subroutine '+name, 'attributes(device) subroutine '+name+'_gpu',1) + + #find subroutine calls + util.funlist = [name.lower()] + util.funlist2 = [] + plus_kernels, text = find_function_calls(text,'attributes(device) ',name+'_gpu') + funcs = util.replace_soa_subroutines(util.funlist2,0,soaflags,maps,accs,mapnames,1,hydra,bookleaf,unknown_size_red_atomic,[],atomics) + text = '' + for func in funcs: + text = text + '\n' 
+ func['function_text'] + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) + text = text.replace('cop2rep','!cop2rep') + + #strip "use" statements + i = re.search('\\buse\\b',text.lower()) + i_offset = 0 + while not (i is None): + i_offset = i_offset+i.start() + if not ('HYDRA_CONST_MODULE' in text[i_offset:i_offset+23]): + text = text[0:i_offset]+'!'+text[i_offset:] + i_offset = i_offset+4 + i = re.search('\\buse\\b',text[i_offset:].lower()) + + + file_text += text + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += 'attributes (host) subroutine ' + name + '' + text[i+ 11 + len(name):j]+'\n\n' + kern_text = 'attributes (device) subroutine ' + name + '_gpu' + text[i+ 11 + len(name):j]+'_gpu\n\n' + for const in range(0,len(consts)): + i = re.search('\\b'+consts[const]['name']+'\\b',kern_text) + if i != None: +# print 'Found ' + consts[const]['name'] + j = i.start() + kern_text = kern_text[0:j+1] + re.sub('\\b'+consts[const]['name']+'\\b',consts[const]['name']+'_OP2',kern_text[j+1:]) + + text = replace_soa(kern_text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + if any_soa: + text = re.sub('\\bDIMENSION\([A-Za-z0-9_]*\)','DIMENSION(*)',text) + file_text += text + + else: + depth -= 2 + code('attributes (host) &') + code('#include "'+name+'.inc"') + code('attributes (device) &') + fid = open(name+'.inc2', 'r') + text = fid.read() +# text = 
replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf,[],atomics) + #find subroutine calls + util.funlist = [name.lower()] + util.funlist2 = [] + plus_kernels, text = find_function_calls(text,'attributes(device) ',name+'_gpu') + funcs = util.replace_soa_subroutines(util.funlist2,0,soaflags,maps,accs,mapnames,1,hydra,bookleaf,unknown_size_red_atomic,[],atomics) + text = '' + for func in funcs: + text = text + '\n' + func['function_text'] + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) + code(text) + depth += 2 + code('') + + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE op_cuda_'+name+'( &'); depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX) and unknown_size_red_atomic[g_m] != 1: + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opGblDat'+str(g_m+1)+'Device'+name+', &') + + if ninds > 0: #indirect loop + code('& start, end, &') + if not atomics: + code('& 
pcol_reord, &') + code('& setSize)') + else: #direct loop + code('& setSize)') + + code('') + if hydra and using_consts: + code('use HYDRA_CONST_MODULE') + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(invinds[g_m]+1)+'Dim') + if indaccs[g_m]==OP_READ: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + else: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), DEVICE, INTENT(IN) :: opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_READ: + code(typs[g_m]+', DEVICE, INTENT(IN) :: opDat'+str(g_m+1)+'Device'+name+'(*)') + else: + code(typs[g_m]+', DEVICE :: opDat'+str(g_m+1)+'Device'+name+'(*)') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + #and additionally we need registers to store contributions, depending on dim: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opGblDat'+str(g_m+1)+'Device'+name) + elif unknown_size_red_atomic[g_m]==0: + if g_m in needDimList: + code(typs[g_m]+', DEVICE :: scratchDevice'+str(g_m+1)+'(*)') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opGblDat'+str(g_m+1)+'Device'+name) + else: + #if it's 
not a global reduction, and multidimensional then we pass in a device array + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + if accs[g_m] == OP_READ: #if OP_READ and dim 1, we can pass in by value + code(typs[g_m]+', VALUE :: opGblDat'+str(g_m+1)+'Device'+name) + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + + if ninds > 0: #indirect loop + code('INTEGER(kind=4), VALUE :: start, end') + if not atomics: + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pcol_reord') + code('INTEGER(kind=4), VALUE :: setSize') + code('') + + code('INTEGER(kind=4) :: i3') + + else: #direct loop + code('INTEGER(kind=4), VALUE :: setSize') + + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + if unknown_reduction_size == 1: + code('INTEGER(kind=4) :: thrIdx') + + if reduct: + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and unknown_size_red_atomic[g_m]==0: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + code('REAL(kind=8), DIMENSION(0:*), SHARED :: redFloat8') + if add_int: + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: redInt4') + + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if g_m in needDimList: + print('Error, cannot statically determine dim of argument '+str(g_m+1)+' in kernel '+name) + sys.exit(-1) + code(typs[g_m]+', DIMENSION('+dims[g_m]+') :: opDat'+str(g_m+1)+'Staged') + + code('') + if unknown_reduction_size == 1: + code('thrIdx = threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and unknown_size_red_atomic[g_m]==0: + if accs[g_m] == OP_INC: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = 0') + 
ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = 0') + elif accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1)') + else: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1)') + ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:(blockIdx%x - 1)*('+dims[g_m]+') + ('+dims[g_m]+'))') + + + code('') + if ninds > 0: + code('') + code('i1 = threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x') + IF('i1+start 0: #indirect kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif soaflags[g_m] == 1 and maps[g_m] != OP_GBL:# and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i3)' + elif maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '(i3 * ('+dims[g_m]+') +1' + \ + ':i3 * ('+dims[g_m]+') + ('+dims[g_m]+'))' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '(i3 * ('+dims[g_m]+') +1)' + elif maps[g_m] == OP_MAP :# and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'):'+ \ + ' map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: 
+ line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + if unknown_size_red_atomic[g_m] == 0: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& reductionArrayDevice'+str(g_m+1)+name + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('') + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + i2 * '+get_stride_string(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Device'+name+ '(1 + i2 * direct_stride_OP2CONSTANT + i3) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + ENDIF() #if i2+start < end + else: #direct kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif soaflags[g_m] == 1 and maps[g_m] != OP_GBL and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC):# and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i1)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and 
int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + if unknown_size_red_atomic[g_m] == 0: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& reductionArrayDevice'+str(g_m+1)+name + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 * ('+dims[g_m]+') + 1: i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Device'+name+ '(1 + i2 * direct_stride_OP2CONSTANT + i1) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1) ') + ENDDO() + if optflags[g_m]==1: + ENDIF() + ENDDO() + + #call cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX) and unknown_size_red_atomic[g_m] == 0: + if accs[g_m] == OP_INC: + op = '0' + elif accs[g_m] == OP_MIN: + op = '1' + elif accs[g_m] == OP_MAX: + op = '2' + if 'real' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionFloat8(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionFloat8(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 
1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + code('i2 = MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionFloat8Mdim(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + elif 'integer' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionInt4(redInt4, reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionInt4(redInt4, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + code('i2 = MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionInt4Mdim(redInt4, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CPU hust stub +########################################################################## + + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + IF('getHybridGPU().EQ.1') + code('CALL '+name+'_host_gpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if 
g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + if hybrid == 1: + ELSE() + code('CALL '+name+'_host_cpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ENDIF() + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for GPU execution') + code('') + code('attributes (host) SUBROUTINE '+name+'_host_gpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + if util.const_list: + code('use HYDRA_CONST_MODULE') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(invinds[g_m]+1)+'Device'+name) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: opMap'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(g_m+1)+'Device'+name) + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + 
code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: mappingArray'+str(invinds[g_m]+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + if not atomics: + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pcol_reord') + code('INTEGER(kind=4), DIMENSION(:), POINTER :: color2_offsets') + else: + code('INTEGER(kind=4) :: itstart, itend') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + if not atomics: + code('REAL(kind=4) :: dataTransfer, dataTransfer2') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + code('INTEGER(kind=4), SAVE :: calledTimes=0') + code('INTEGER(kind=4) :: istat') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or 
(not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code(typs[g_m]+', DIMENSION(:), POINTER :: opDat'+str(g_m+1)+'Host') + else: + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Host_tmp') #XLF workaround + if (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + if g_m in needDimList and unknown_size_red_atomic[g_m] == 0: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: scratchDevice'+str(g_m+1)) + code('INTEGER(kind=4) :: scratchDevice'+str(g_m+1)+'Size') + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + #code('print *,"'+name+'"') + + #managing constants + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(calledTimes.EQ.0).OR.(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(g_m+1)+'))') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + ENDIF() + if dir_soa!=-1: + 
IF('(calledTimes.EQ.0).OR.(direct_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(dir_soa+1)+'))') + code('direct_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(dir_soa+1)+')') + code('direct_stride_OP2CONSTANT = direct_stride_OP2HOST') + ENDIF() + + #TODO: this is terrible + # for const in util.const_list: + # code(const+'_OP2CONSTANT = '+const) + + code('call op_timers_core(startTime)') + code('') + #code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('n_upper = op_mpi_halo_exchanges_grouped(set%setCPtr,numberOfOpDats,opArgArray,2)') + if not atomics: + IF('n_upper.GT.0') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + #code('partitionSize = OP_PART_SIZE_ENV') + code('') + if not atomics: + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,4)') + code('') + else: + code('') + if unknown_reduction_size == 1: + code('blocksPerGrid = 200') + else: + code('blocksPerGrid = 600') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opMap'+str(invinds[g_m]+1)+'Cardinality = set%setPtr%size * getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim 
* getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + code('') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data_d,opMap'+str(invinds[g_m]+1)+'Device'+name+',(/opMap'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m])>1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host,(/opDat'+str(g_m+1)+'Cardinality/))') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('opDat'+str(g_m+1)+'Host_tmp = opDat'+str(g_m+1)+'Host') #XLF workaround + code('') + + if ninds > 0 and not atomics: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%color2_offsets,color2_offsets,(/actualPlan_'+name+'%ncolors+1/))') + code('CALL c_f_pointer(actualPlan_'+name+'%col_reord,pcol_reord,(/set%setPtr%size+set%setPtr%exec_size/))') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + IF('.not. 
allocated(opGblDat'+str(g_m+1)+'Device'+name+')') + code('allocate(opGblDat'+str(g_m+1)+'Device'+name+'(opArg'+str(g_m+1)+'%dim))') + ENDIF() + code('opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim) = opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim)') + if ninds>0 and reduct: + code('blocksPerGrid=0') + if not atomics: + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = blocksPerGrid+(color2_offsets(i2+2)-color2_offsets(i2+1)-1)/threadsPerBlock+1') + ENDDO() + else: + code('blocksPerGrid = (set%setPtr%size+set%setPtr%exec_size-1)/threadsPerBlock+1') + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + if unknown_size_red_atomic[g_m] == 0: + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + else: + code('reductionCardinality'+str(g_m+1)+' = 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + IF ('.not. 
allocated(reductionArrayDevice'+str(g_m+1)+name+')') + code('allocate( reductionArrayDevice'+str(g_m+1)+name+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + ENDIF() + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.0') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = 0.0') + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = opDat'+str(g_m+1)+'Host') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Host') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+name+' = reductionArrayHost'+str(g_m+1)+'') + + code('') + if unknown_reduction_size==1: + if ninds>0: + code('blocksPerGrid = 0') + if not atomics: + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = MAX(blocksPerGrid,(color2_offsets(i2+2)-color2_offsets(i2+1)-1)/threadsPerBlock+1)') + ENDDO() + else: + code('blocksPerGrid = MAX((set%setPtr%core_size-1)/threadsPerBlock+1,(set%setPtr%size+set%setPtr%exec_size-set%setPtr%core_size-1)/threadsPerBlock+1)') + code('call prepareScratch(opArgArray,numberOfOpDats,blocksPerGrid*threadsPerBlock)') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN) and (g_m in needDimList): + #IF('opArg'+str(g_m+1)+'%dim.gt.20') + #code('print *,"'+name+'",'+str(g_m+1)+',opArg'+str(g_m+1)+'%dim') + #ENDIF() + code('scratchDevice'+str(g_m+1)+'Size = opArg'+str(g_m+1)+'%dim*blocksPerGrid*threadsPerBlock') + code('call c_f_pointer(opArgArray('+str(g_m+1)+')%data_d,scratchDevice'+str(g_m+1)+',(/scratchDevice'+str(g_m+1)+'Size/))') + + + #indirect loop host stub call + if ninds > 0: + if not atomics: + 
DO('i2','0','actualPlan_'+name+'%ncolors') + IF('i2 .EQ. actualPlan_'+name+'%ncolors_core') + #code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,2)') + ENDIF() + code('') + code('blocksPerGrid = (color2_offsets(i2+2)-color2_offsets(i2+1)-1)/threadsPerBlock+1') + else: + DO('i2','0','2') + IF('i2 .EQ. 1') + code('itstart = set%setPtr%core_size') + code('itend = n_upper') + #code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,2)') + ELSE() + code('itstart = 0') + code('itend = set%setPtr%core_size') + ENDIF() + code('') + code('blocksPerGrid = (itend-itstart-1)/threadsPerBlock+1') + + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + IF('blocksPerGrid.gt.0') + #code('print *,"'+name+'", blocksPerGrid') + code('CALL op_cuda_'+name+' <<>> (&') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opMap'+str(invinds[inds[g_m]-1]+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if g_m in needDimList and unknown_size_red_atomic[g_m] == 0: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host_tmp, &') #XLF workaround + if not atomics: + code('& 
color2_offsets(i2+1), color2_offsets(i2+2), &') + code('& pcol_reord,set%setPtr%size+set%setPtr%exec_size)') + else: + code('& itstart, itend, &') + code('& set%setPtr%size+set%setPtr%exec_size)') + ENDIF() + ENDDO() + code('') + else: #direct loop host stub call + code('CALL op_cuda_'+name+' <<>>( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if g_m in needDimList and unknown_size_red_atomic[g_m] == 0: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host_tmp, &') #XLF workaround + code('set%setPtr%size)') + code('') + if not atomics: + ENDIF() + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. 
set%setPtr%core_size)') + #code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,2)') + ENDIF() + code('') + + code('') + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim) = opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim)') + + if reduct: + #reductions + IF('n_upper.GT.0') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+name+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = opDat'+str(g_m+1)+'Host + reductionArrayHost'+str(g_m+1)+'(i10+1)') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') + reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+'))') + elif accs[g_m] == OP_MIN: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MIN(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MIN(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + elif accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MAX(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MAX(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : 
i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + ENDDO() + code('') + if optflags[g_m] == 1: + ENDIF() + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') +# code('deallocate( reductionArrayDevice'+str(g_m+1)+' )') + ENDIF() + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + else: + print('Error, reduction type '+typs[g_m]+' unrecognised') + code('') + if optflags[g_m] == 1: + ENDIF() + + code('istat = cudaDeviceSynchronize()') + IF('istat.ne.0') + code('print *,cudaGetErrorString(istat)') + ENDIF() + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + 
+ + if ninds > 0: + if not atomics: + code('dataTransfer = 0.0') + code('dataTransfer2 = 0.0') + IF('n_upper.GT.0') + code('dataTransfer = actualPlan_'+name+'%transfer') + code('dataTransfer2 = actualPlan_'+name+'%transfer2') + ENDIF() + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime, dataTransfer,dataTransfer2, 1)') + else: + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime, 0.00000_4, 0.00000_4, 1)') + else: + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('') + if hybrid == 1: + code('') + comm('Stub for CPU execution') + code('') +########################################################################## +# Generate OpenMP host stub +########################################################################## +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] == 
OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +############################################################################ +### Generate OpenMP host stub +############################################################################ + code('SUBROUTINE '+name+'_host_cpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + 
code('& opArg'+str(g_m+1)+', &') + + code('END SUBROUTINE') + code('END MODULE') +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'_'+name + fid = open(name+'_gpukernel.CUF','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_gpukernel.CUF','w') + else: + fid = open(name+'_kernel.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! auto-generated by op2.py\n!\n\n') + fid.write(file_text) + fid.close() + +########################################################################## +# Assemble Hydra master file +########################################################################## +def op2_gen_cuda_hydra(): + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + file_text = '' + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + comm('Constant declarations') + code('#include "hydra_constants.inc"') + code('') + comm('Loop-specific global variables') + file_text += header_text + + code('') + code('CONTAINS') + code('') + code('#include "hydra_constants_set.inc"') + code('#include "flux_low_gpufun.inc"') + code('#include "bcs_kernels_gpufun.inc"') + code('#include "update_kernels_gpufun.inc"') + + file_text += body_text + code('END MODULE') + fid = open('hydra_kernels.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py\n!\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/fortran/op2_gen_cuda_gbl.py b/translator/fortran/op2_gen_cuda_gbl.py new file mode 100644 index 000000000..213bc2bd9 --- /dev/null +++ b/translator/fortran/op2_gen_cuda_gbl.py @@ -0,0 +1,1644 @@ +########################################################################## +# +# CUDA code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_kernel.CUF for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import sys +import util + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYPS',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += prefix+rep(text,g_m)+'\n' + +def code_pre(text): + global file_text, 
FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def DO_STEP(i,start,finish,step): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1, '+step) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+' = '+i+' + '+step+' ){') + depth += 2 + +def DOWHILE(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO WHILE ('+line+' )') + elif CPP: + code('while ('+ line+ ' )') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+' - 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + +arg_parse=util.arg_parse +replace_consts=util.replace_consts +replace_npdes=util.replace_npdes +get_stride_string=util.get_stride_string +replace_soa = util.replace_soa +find_function_calls=util.find_function_calls + +def 
op2_gen_cuda_gbl(master, date, consts, kernels, hydra, bookleaf): + +# global util.funlist, util.const_list + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + util.funlist = [] + util.const_list = [] + + header_text = '' + body_text = '' + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + hybrid = 0 +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + + +# +# set two logicals +# + j = -1 + ind_rw = 0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + if maps[i] == OP_MAP and accs[i] == OP_RW: + ind_rw = 1 + ind_inc = j >= 0 + + j = -1 + reduct_mdim = 0 + reduct_1dim = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and (accs[i] == OP_INC or accs[i] == OP_MAX or accs[i] == OP_MIN): + j = i + if (not dims[i].isdigit()) or int(dims[i])>1: + reduct_mdim = 1 + else: + reduct_1dim = 1 + if maps[i] == OP_GBL and accs[i] == OP_WRITE: + j = i + reduct = reduct_1dim or reduct_mdim + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + stage_flags=[0]*nargs; + + for g_m in range(0,nargs): + if dims[g_m] == 'NPDE': + dims[g_m] = '6' + +# if ('GRADL_EDGECON' in name): + for g_m in range(0,nargs): + if 'NPDE' in dims[g_m]: + dims[g_m] = dims[g_m].replace('NPDE','6') + try: + newdim = str(eval(dims[g_m])) + dims[g_m] = newdim + except NameError as inst: + dims[g_m] + #do nothing + + + unknown_reduction_size = 0 + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()): + found=0 + for string in ['NPDE','DNTQMU','DNFCROW','1*1']: + if string in dims[g_m]: + found=1 + if found==0: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + unknown_reduction_size = 1 + soaflags[g_m] = 1 + is_soa = 1 + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + permute = 0 + if ('ACCUMEDGES' in name) or ('GRADL_EDGECON' in name): + #if ('ACCUMEDGES' in name) or ('IFLUX_EDGEF' in name): + permute = 1 + + stage_inc = 0 + if ('IFLUX_EDGE' in name) or ('VFLUX_EDGE' in name): + stage_inc = 1 + + #figure out which maps to stage 
+ ninds_staged = 0 + inds_staged = [-1]*nargs + if stage_inc: + for i in range(0,nargs): + if maps[i]==OP_MAP and accs[i]==OP_INC: + if inds_staged[invinds[inds[i]-1]] == -1: + inds_staged[i] = ninds_staged + ninds_staged = ninds_staged + 1 + else: + inds_staged[i] = inds_staged[invinds[inds[i]-1]] + invinds_staged = [-1]*ninds_staged + inddims_staged = [-1]*ninds_staged + indopts_staged = [-1]*ninds_staged + if stage_inc: + for i in range(0,nargs): + if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: + invinds_staged[inds_staged[i]] = i + inddims_staged[inds_staged[i]] = dims[i] + if optflags[i] == 1: + indopts_staged[inds_staged[i]] = i + for i in range(0,nargs): + inds_staged[i] = inds_staged[i] + 1 + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + if hydra: + code('USE HYDRA_STRIDE_MODULE') + code('') + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## 
+ code('') + comm(name+'variable declarations') + code('') + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), CONSTANT :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT') + code('INTEGER(kind=4) :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('INTEGER(kind=4), CONSTANT :: direct_stride_OP2CONSTANT') + code('INTEGER(kind=4) :: direct_stride_OP2HOST') + dir_soa = g_m + break + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if (accs[g_m]== OP_INC or accs[g_m]== OP_MIN or accs[g_m]== OP_MAX): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)+name) + if ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opGblDat'+str(g_m+1)+'Device'+name) + + + code('') + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + code('') + if is_soa > -1: + code('#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)') + code('') + code('CONTAINS') + code('') + +########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct_1dim or unknown_reduction_size: + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + 
code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + code('attributes (device) SUBROUTINE ReductionInt4(reductionResult,inputValue,reductionOperation)') + code('INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult') + code('INTEGER(kind=4) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: sharedInt4') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedInt4(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedInt4(threadID) = 
sharedInt4(threadID) + sharedInt4(threadID + i1)') + code('CASE (1)') + IF('sharedInt4(threadID + i1) < sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(threadID + i1) > sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedInt4(0)') + code('CASE (1)') + IF('sharedInt4(0) < reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(0) > reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + if reduct_mdim: + comm('Multidimensional reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8Mdim(reductionResult,inputValue,reductionOperation,dim)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(:) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), VALUE :: dim') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: d') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2)') + ENDDO() + code('CASE (1)') + 
DO('i2','0','dim') +# IF('sharedDouble8(threadID*dim + i2).GT.sharedDouble8((threadID + i1)*dim + i2)') + code('sharedDouble8(threadID*dim + i2) = MIN(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + #ENDIF() + ENDDO() + code('CASE (2)') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = MAX(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + ENDDO() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1)') + code('CASE (1)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MIN(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('CASE (2)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MAX(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + + +########################################################################## +# Inline user kernel function +########################################################################## + if hydra: + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','subroutine') + if hybrid == 1: + text = text.replace('subroutine '+name, 'attributes(host) subroutine '+name) + file_text += text + code('') + code('') + #remove all comments + util.const_list = [] + text = re.sub('!.*\n','\n',text) + text = replace_consts(text) + text = text.replace('subroutine '+name, 'attributes(device) subroutine '+name+'_gpu',1) + + + using_npdes = 0 + for g_m in range(0,nargs): + if var[g_m] == 'npdes': + using_npdes = 1 + if using_npdes==1: + text = 
replace_npdes(text) + + #find subroutine calls + util.funlist = [name.lower()] + plus_kernels = find_function_calls(text,'attributes(device) ') + if plus_kernels == '': + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + text = text + '\n' + plus_kernels + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) +# text = re.sub(r'\\b'+fun+'\\b',fun+'_gpu',text,flags=re.I) + + if plus_kernels != '': + for i in range(0,nargs): + if soaflags[i]==1 and not (maps[i]==OP_MAP and accs[i]==OP_INC) and not (maps[i] ==OP_GBL): + stage_flags[i] = 1; + + #strip "use" statements + i = re.search('\\buse\\b',text.lower()) + i_offset = 0 + while not (i is None): + i_offset = i_offset+i.start() + if not ('HYDRA_CONST_MODULE' in text[i_offset:i_offset+23]): + text = text[0:i_offset]+'!'+text[i_offset:] + i_offset = i_offset+4 + i = re.search('\\buse\\b',text[i_offset:].lower()) + + + file_text += text + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += 'attributes (host) subroutine ' + name + '' + text[i+ 11 + len(name):j]+'\n\n' + kern_text = 'attributes (device) subroutine ' + name + '_gpu' + text[i+ 11 + len(name):j]+'_gpu\n\n' + for const in range(0,len(consts)): + i = re.search('\\b'+consts[const]['name']+'\\b',kern_text) + if i != None: + print('Found ' + consts[const]['name']) + j = i.start() + kern_text = kern_text[0:j+1] + re.sub('\\b'+consts[const]['name']+'\\b',consts[const]['name']+'_OP2',kern_text[j+1:]) + + # + # 
Apply SoA to variable accesses + # + j = kern_text.find(name+'_gpu') + endj = arg_parse(kern_text,j) + while kern_text[j] != '(': + j = j + 1 + arg_list = kern_text[j+1:endj] + arg_list = arg_list.replace('&','') + varlist = ['']*nargs + leading_dim = [-1]*nargs + for g_m in range(0,nargs): + varlist[g_m] = arg_list.split(',')[g_m].strip() + for g_m in range(0,nargs): + if soaflags[g_m] and not (maps[g_m]==OP_MAP and accs[g_m]==OP_INC): + #Start looking for the variable in the code, after the function signature + loc1 = endj + p = re.compile('\\b'+varlist[g_m]+'\\b') + nmatches = len(p.findall(kern_text[loc1:])) + for id in range(0,nmatches): + #Search for the next occurence + i = p.search(kern_text[loc1:]) + #Skip commented out ones + j = kern_text[:loc1+i.start()].rfind('\n') + if j > -1 and kern_text[j:loc1+i.start()].find('!')>-1: + loc1 = loc1+i.end() + continue + + #Find closing bracket + if leading_dim[g_m] == -1: + endarg = loc1+i.start() + len(varlist[g_m]) + else: + endarg = arg_parse(kern_text,loc1+i.start()) + #Find opening bracket + beginarg = loc1+i.start() + while kern_text[beginarg] != '(': + beginarg = beginarg+1 + beginarg = beginarg+1 + + #If this is the first time we see the argument (i.e. 
its declaration) + if leading_dim[g_m] == -1: + if (len(kern_text[beginarg:endarg].split(',')) > 1): + #if it's 2D, remember leading dimension, and make it 1D + leading_dim[g_m] = kern_text[beginarg:endarg].split(',')[0] + kern_text = kern_text[:beginarg] + '*'+' '*(endarg-beginarg-1) + kern_text[endarg:] + else: + leading_dim[g_m] = 1 + #Continue search after this instance of the variable + loc1 = endarg+1 + else: + #If we have seen this variable already, then it's in the actual code, replace it with macro + macro = 'OP2_SOA('+kern_text[loc1+i.start():loc1+i.end()]+',' + if leading_dim[g_m] == 1: + macro = macro + kern_text[beginarg:endarg] + else: + macro = macro + kern_text[beginarg:endarg].split(',')[0] + '+('+kern_text[beginarg:endarg].split(',')[1]+'-1)*'+leading_dim[g_m] + if maps[g_m] == OP_MAP: + if 'el2node' in mapnames[g_m]: + macro = macro + ', nodes_stride_OP2)' + elif 'el2el' in mapnames[g_m]: + macro = macro + ', elements_stride_OP2)' + elif 'el2reg' in mapnames[g_m]: + macro = macro + ', reg_stride_OP2)' + else: + macro = macro + ', ' + set_name.strip()[2:]+'_stride_OP2)' + kern_text = kern_text[:loc1+i.start()] + macro + kern_text[endarg+1:] + #Continue search after this instance of the variable + loc1 = loc1+i.start() + len(macro) + file_text += kern_text + + else: + depth -= 2 + code('attributes (host) &') + code('#include "'+name+'.inc"') + code('attributes (device) &') + fid = open(name+'.inc2', 'r') + text = fid.read() + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + code(text) + depth += 2 + code('') + + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE op_cuda_'+name+'( &'); depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in 
needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + code('& reductionArrayDevice'+str(g_m+1)+', &') + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opGblDat'+str(g_m+1)+'Device'+name+', &') + + if ninds > 0: #indirect loop + if stage_inc: + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + code('& ind_sizes, &') + code('& ind_offs, &') + + code('& pcol_reord, &') + code('& setSize, &') + code('& exec_count)') + else: #direct loop + code('& setSize)') + + code('') + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(invinds[g_m]+1)+'Dim') + if indaccs[g_m]==OP_READ: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + else: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + if nmaps > 0: + k = [] + for 
g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), DEVICE, INTENT(IN) :: opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_READ: + code(typs[g_m]+', DEVICE, INTENT(IN) :: opDat'+str(g_m+1)+'Device'+name+'(*)') + else: + code(typs[g_m]+', DEVICE :: opDat'+str(g_m+1)+'Device'+name+'(*)') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + #if it's a global reduction, then we pass in a reductionArrayDevice + code(typs[g_m]+', DIMENSION(:), DEVICE :: reductionArrayDevice'+str(g_m+1)) + #and additionally we need registers to store contributions, depending on dim: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opGblDat'+str(g_m+1)+'Device'+name) + else: + if g_m in needDimList: + code(typs[g_m]+', DEVICE :: scratchDevice'+str(g_m+1)+'(*)') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opGblDat'+str(g_m+1)+'Device'+name) + else: + #if it's not a global reduction, and multidimensional then we pass in a device array + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + if accs[g_m] == OP_READ: #if OP_READ and dim 1, we can pass in by value + code(typs[g_m]+', VALUE :: opGblDat'+str(g_m+1)+'Device'+name) + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + + if ninds > 0: #indirect loop + + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pcol_reord') + code('INTEGER(kind=4), VALUE :: exec_count') + code('INTEGER(kind=4), VALUE :: setSize') + 
code('INTEGER(kind=4) :: i3') + code('') + + else: #direct loop + code('INTEGER(kind=4), VALUE :: setSize') + + code('INTEGER(kind=4) :: i1') + if reduct: + code('INTEGER(kind=4) :: i2') + + if unknown_reduction_size: + code('INTEGER(kind=4) :: thrIdx') + + code('') + if unknown_reduction_size: + code('thrIdx = threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = 0') + ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = 0') + elif accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1)') + else: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1)') + ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:(blockIdx%x - 1)*('+dims[g_m]+') + ('+dims[g_m]+'))') + + code('') + if ninds>0: + DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x','exec_count','blockDim%x * gridDim%x') + code('i3 = pcol_reord(i1)') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): + k = k + [(optflags[g_m]*nargs+mapinds[g_m])] + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + setSize * '+str(int(idxs[g_m])-1)+')') + ENDIF() + else: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + setSize * '+str(int(idxs[g_m])-1)+')') + code('') + comm('kernel call') + + else: + 
DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x','setSize','blockDim%x * gridDim%x') + code('') + comm('kernel call') + code('') + +########################################################################## +# CUDA kernel call +########################################################################## + if ninds > 0: #indirect kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and maps[g_m] != OP_GBL: + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i3)' + elif maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '(i3 * ('+dims[g_m]+') +1' + \ + ':i3 * ('+dims[g_m]+') + ('+dims[g_m]+'))' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '(i3 * ('+dims[g_m]+') +1)' + elif maps[g_m] == OP_MAP:# and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'):'+ \ + ' map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + 
indent + '& )') + depth = depth + 2 + code('') + + ENDDO() + else: #direct kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if soaflags[g_m] == 1 and maps[g_m] != OP_GBL and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC):# and optflags[g_m]==0: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i1)' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 * ('+dims[g_m]+') + 1: i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + + #call cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if accs[g_m] == OP_INC: + op = '0' + elif accs[g_m] == OP_MIN: + op = '1' + elif accs[g_m] == OP_MAX: + op = '2' + if 'real' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionFloat8(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionFloat8(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + 
code('i2 = MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionFloat8Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + elif 'integer' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionInt4(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionInt4(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + code('i2 = MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionInt4Mdim(reductionArrayDevice'+str(g_m+1)+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CPU hust stub +########################################################################## + + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + IF('getHybridGPU().EQ.1') + code('CALL '+name+'_host_gpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + if hybrid == 1: + ELSE() + code('CALL '+name+'_host_cpu( userSubroutine, set, &'); + 
for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ENDIF() + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for GPU execution') + code('') + code('attributes (host) SUBROUTINE '+name+'_host_gpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + if util.const_list: + code('use HYDRA_CONST_MODULE') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(invinds[g_m]+1)+'Device'+name) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: opMap'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(g_m+1)+'Device'+name) + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: 
opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: mappingArray'+str(invinds[g_m]+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pcol_reord') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: exec_size') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + if reduct: + code('INTEGER(kind=4) :: blockOffset') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + + code('INTEGER(kind=4), SAVE :: calledTimes=0') + code('INTEGER(kind=4) :: istat') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code(typs[g_m]+', DIMENSION(:), POINTER :: opDat'+str(g_m+1)+'Host') + else: + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + if (accs[g_m] 
== OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + if g_m in needDimList: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: scratchDevice'+str(g_m+1)) + code('INTEGER(kind=4) :: scratchDevice'+str(g_m+1)+'Size') + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000_4,0.00000_4, 0)') + + #managing constants + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(calledTimes.EQ.0).OR.(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(g_m+1)+'))') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + ENDIF() + if dir_soa!=-1: + IF('(calledTimes.EQ.0).OR.(direct_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(dir_soa+1)+'))') + code('direct_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(dir_soa+1)+')') + code('direct_stride_OP2CONSTANT = direct_stride_OP2HOST') + ENDIF() + + #TODO: this is terrible + # for const in util.const_list: + # code(const+'_OP2CONSTANT = '+const) + + code('call op_timers_core(startTime)') + code('') + code('n_upper = 
op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + #code('partitionSize = OP_PART_SIZE_ENV') + code('') + code('opSetCore => set%setPtr') + code('exec_size = opSetCore%size + opSetCore%exec_size') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,4)') + code('') + else: + code('') + if unknown_reduction_size: + code('blocksPerGrid = 100') + else: + code('blocksPerGrid = 600') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opMap'+str(invinds[g_m]+1)+'Cardinality = exec_size * getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + code('') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL 
c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data_d,opMap'+str(invinds[g_m]+1)+'Device'+name+',(/opMap'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m])>1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host,(/opDat'+str(g_m+1)+'Cardinality/))') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') + code('') + + if ninds > 0: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%col_reord,pcol_reord,(/exec_size/))') + code('CALL c_f_pointer(actualPlan_'+name+'%color2_offsets,offset_'+name+',(/actualPlan_'+name+'%ncolors+1/))') + + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + IF('.not. allocated(opGblDat'+str(g_m+1)+'Device'+name+')') + code('allocate(opGblDat'+str(g_m+1)+'Device'+name+'(opArg'+str(g_m+1)+'%dim))') + ENDIF() + code('opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim) = opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim)') + if ninds>0 and reduct: + code('blocksPerGrid=0') + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = blocksPerGrid+(offset_'+name+'(i2+2) - offset_'+name+'(i2+1)-1)/threadsPerBlock+1') + ENDDO() + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + IF ('.not. 
allocated(reductionArrayDevice'+str(g_m+1)+name+')') + code('allocate( reductionArrayDevice'+str(g_m+1)+name+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + ENDIF() + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.0') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = 0.0') + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = opDat'+str(g_m+1)+'Host') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Host') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+name+' = reductionArrayHost'+str(g_m+1)+'') + + code('') + if unknown_reduction_size: + if ninds>0: + code('blocksPerGrid = 0') + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = MAX(blocksPerGrid,ncolblk(i2+1))') + ENDDO() + code('call prepareScratch(opArgArray,numberOfOpDats,blocksPerGrid*threadsPerBlock)') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN) and (g_m in needDimList): + code('scratchDevice'+str(g_m+1)+'Size = opArg'+str(g_m+1)+'%dim*blocksPerGrid*threadsPerBlock') + code('call c_f_pointer(opArgArray('+str(g_m+1)+')%data_d,scratchDevice'+str(g_m+1)+',(/scratchDevice'+str(g_m+1)+'Size/))') + + + #indirect loop host stub call + if ninds > 0: + if reduct: + code('blockOffset = 0') + code('') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + #code('threadsPerBlock = OP_PART_SIZE_ENV') + + DO('i2','0','actualPlan_'+name+'%ncolors') + IF('i2 .EQ. 
1') #actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('blocksPerGrid = (offset_'+name+'(i2+2) - offset_'+name+'(i2+1)-1)/threadsPerBlock+1') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + code('CALL op_cuda_'+name+' <<>> (&') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opMap'+str(invinds[inds[g_m]-1]+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('& reductionArrayDevice'+str(g_m+1)+name+'(blockOffset:), &') + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + + code('& pcol_reord(offset_'+name+'(i2+1)+1:),set%setPtr%size+set%setPtr%exec_size,offset_'+name+'(i2+2) - offset_'+name+'(i2+1))') + code('') + if reduct: + code('blockOffset = blockOffset + blocksPerGrid') + ENDDO() + code('') + else: #direct loop host stub call + if "UPDATEK" == name: + code('istat = cudaFuncSetCacheConfig(op_cuda_UPDATEK,cudaFuncCachePreferShared)') + code('CALL op_cuda_'+name+' <<>>( &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in 
needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('& reductionArrayDevice'+str(g_m+1)+name+', &') + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host, &') + code('set%setPtr%size)') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. set%setPtr%core_size)') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('') + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim) = opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim)') + + if reduct: + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+name+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = opDat'+str(g_m+1)+'Host + reductionArrayHost'+str(g_m+1)+'(i10+1)') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') + reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+'))') + elif accs[g_m] == OP_MIN: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MIN(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MIN(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + elif accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and 
int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MAX(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MAX(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') +# code('deallocate( reductionArrayDevice'+str(g_m+1)+' )') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('istat = cudaDeviceSynchronize()') + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, 
actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('') + if hybrid == 1: + code('') + comm('Stub for CPU execution') + code('') +########################################################################## +# Generate OpenMP host stub +########################################################################## +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if 
maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +############################################################################ +### Generate OpenMP host stub +############################################################################ + code('SUBROUTINE '+name+'_host_cpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('END SUBROUTINE') + code('END MODULE') +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_gpukernel.CUF','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_gpukernel.CUF','w') + else: + fid = open(name+'_kernel.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py\n!\n\n') + fid.write(file_text) + fid.close() + +########################################################################## +# Assemble Hydra master file +########################################################################## +def op2_gen_cuda_hydra(): + global dims, idxs, typs, indtyps, inddims + global file_format, cont, comment + global FORTRAN, CPP, g_m, file_text, depth, header_text, body_text + + file_text = '' + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + comm('Constant declarations') + code('#include "hydra_constants.inc"') + code('') + comm('Loop-specific global variables') + file_text += header_text + + code('') + code('CONTAINS') + code('') + code('#include "hydra_constants_set.inc"') + code('#include "flux_low_gpufun.inc"') + code('#include "bcs_kernels_gpufun.inc"') + code('#include "update_kernels_gpufun.inc"') + + file_text += body_text + code('END MODULE') + fid = open('hydra_kernels.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
##########################################################################
#
# CUDA code generator
#
# This routine is called by op2_fortran which parses the input files
#
# It produces a file xxx_kernel.CUF for each kernel,
# plus a master kernel file
#
##########################################################################

import re
import datetime


# The helpers below share state through module-level globals that the
# generator entry point assigns before calling them:
#   file_text        - accumulated text of the file being generated
#   depth            - current indentation, in spaces
#   FORTRAN / CPP    - truthy flag selecting the output language
#   g_m              - index of the argument currently being expanded
#   dims, idxs, typs - per-argument lists read by rep()
#   inddims, indtyps - per-indirect-argument lists read by rep()


def comm(line):
    """Append a comment line (or a blank line) to ``file_text``.

    An empty ``line`` always produces a bare newline. Otherwise the text is
    emitted as ``! line`` for Fortran (no indentation is applied) or as an
    indented ``//line`` for C++. Nothing is emitted if neither language
    flag is set.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
    elif FORTRAN:
        file_text += '! ' + line + '\n'
    elif CPP:
        file_text += ' ' * depth + '//' + line + '\n'


def rep(line, m):
    """Return ``line`` with the placeholder tokens expanded for argument ``m``.

    Tokens are replaced in this order (the order matters because some
    tokens are substrings of others, e.g. ``ARG`` inside ``INDARG``):

    * ``INDDIM`` / ``INDTYP`` -> ``inddims[m]`` / ``indtyps[m]``; only when
      ``m`` is a valid index into ``inddims``
    * ``INDARG``              -> ``ind_arg<n>``
    * ``DIMS`` (Fortran) or ``DIM`` (C++)   -> ``dims[m]``
    * ``ARG``                 -> ``arg<n>``
    * ``TYPS`` (Fortran) or ``TYP`` (C++)   -> ``typs[m]``
    * ``IDX``                 -> ``int(idxs[m])``

    ``<n>`` is ``m + 1`` for Fortran and ``m`` for C++. If neither language
    flag is set, ``line`` is returned unchanged.

    The substitutions are plain text replacements, so replacement values
    are inserted literally (a backslash in a type or dimension string is
    not interpreted as a regex escape).
    """
    if FORTRAN:
        dim_token, typ_token, arg_number = 'DIMS', 'TYPS', m + 1
    elif CPP:
        dim_token, typ_token, arg_number = 'DIM', 'TYP', m
    else:
        return line

    # Guard applies to both languages: loops without indirect arguments have
    # an empty inddims list, and indexing it unconditionally would raise
    # IndexError on every emitted line.
    if m < len(inddims):
        line = line.replace('INDDIM', str(inddims[m]))
        line = line.replace('INDTYP', str(indtyps[m]))

    line = line.replace('INDARG', 'ind_arg' + str(arg_number))
    line = line.replace(dim_token, str(dims[m]))
    line = line.replace('ARG', 'arg' + str(arg_number))
    line = line.replace(typ_token, typs[m])
    line = line.replace('IDX', str(int(idxs[m])))
    return line


def code(text):
    """Append one line of generated code to ``file_text``.

    ``text`` is passed through :func:`rep` for the current argument index
    ``g_m`` and indented by ``depth`` spaces; an empty ``text`` yields a
    bare newline with no indentation. Nothing is emitted if neither
    language flag is set.
    """
    global file_text
    if not (FORTRAN or CPP):
        return
    prefix = '' if len(text) == 0 else ' ' * depth
    file_text += prefix + rep(text, g_m) + '\n'


def code_pre(text):
    """Like :func:`code`, but never indents the emitted line.

    NOTE(review): the name suggests this is meant for preprocessor
    directives; nothing in this block confirms how it is used.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text, g_m) + '\n'


def _open_block(fortran_line, cpp_line):
    """Emit the block-opening line for the active language, then indent."""
    global depth
    if FORTRAN:
        code(fortran_line)
    elif CPP:
        code(cpp_line)
    # Indentation grows even when no language flag is set, so that the
    # matching closer always restores the previous depth.
    depth += 2


def _close_block(fortran_line, cpp_line):
    """Dedent, then emit the block-closing line for the active language."""
    global depth
    depth -= 2
    if FORTRAN:
        code(fortran_line)
    elif CPP:
        code(cpp_line)


def DO(i, start, finish):
    """Open a counted loop over ``start <= i < finish`` with unit step."""
    _open_block(
        'DO '+i+' = '+start+', '+finish+' - 1, 1',
        'for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')


def DO_STEP(i, start, finish, step):
    """Open a counted loop over ``start <= i < finish`` advancing by ``step``."""
    _open_block(
        'DO '+i+' = '+start+', '+finish+' - 1, '+step,
        'for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+' = '+i+' + '+step+' ){')


def DOWHILE(line):
    """Open a loop that repeats while the condition ``line`` holds."""
    _open_block(
        'DO WHILE ('+line+' )',
        'while ('+ line+ ' )')


def FOR(i, start, finish):
    """Open a counted loop like :func:`DO`; the Fortran form omits the step."""
    _open_block(
        'DO '+i+' = '+start+', '+finish+' - 1',
        'for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')


def ENDDO():
    """Close the innermost loop opened by DO, DO_STEP, DOWHILE or FOR."""
    _close_block('END DO', '}')


def ENDFOR():
    """Close a loop opened by :func:`FOR`; emits the same text as ENDDO."""
    ENDDO()


def IF(line):
    """Open a conditional block guarded by the condition ``line``."""
    _open_block(
        'IF ('+line+') THEN',
        'if ('+ line + ') {')


def ENDIF():
    """Close the innermost conditional opened by :func:`IF`."""
    _close_block('END IF', '}')
(0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE OP2_CONSTANTS') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm('variable declarations') + + code('TYPE :: '+name+'_opDatDimensions') + depth = depth + 2 + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Dimension') + depth = depth - 2 + code('END TYPE '+name+'_opDatDimensions') + code('') + + code('TYPE :: 
'+name+'_opDatCardinalities') + depth = depth + 2 + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: ind_maps'+str(invinds[g_m]+1)+'Size') + + if ninds > 0: + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code('INTEGER(kind=4) :: mappingArray'+str(g_m+1)+'Size') + + code('INTEGER(kind=4) :: pblkMapSize') + code('INTEGER(kind=4) :: pindOffsSize') + code('INTEGER(kind=4) :: pindSizesSize') + code('INTEGER(kind=4) :: pnelemsSize') + code('INTEGER(kind=4) :: pnthrcolSize') + code('INTEGER(kind=4) :: poffsetSize') + code('INTEGER(kind=4) :: pthrcolSize') + + depth = depth - 2 + code('END TYPE '+name+'_opDatCardinalities') + code('') + + code('') + for g_m in range(0,ninds): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opDat'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opDat'+str(g_m+1)+'Device'+name) + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: ind_maps'+str(g_m+1)+'_'+name) + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code('INTEGER(kind=2), DIMENSION(:), DEVICE, ALLOCATABLE :: mappingArray'+str(g_m+1)+'_'+name) + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + comm('user function') + code('attributes (device) &') + code('#include "'+name+'.inc"') + code('') + code('') + 
+########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct > 0: + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(1:1) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue(1)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 
0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE op_cuda_'+name+'( &'); depth = depth + 2 + code('& opDatDimensions, &') + code('& opDatCardinalities, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code('& reductionArrayDevice'+str(g_m+1)+', &') + + if ninds > 0: #indirect loop + code('& pindSizes, &') + code('& pindOffs, &') + code('& pblkMap, &') + code('& poffset, &') + code('& pnelems, &') + code('& pnthrcol, &') + code('& pthrcol, &') + code('& blockOffset)') + else: #direct loop + code('& setSize, &') + code('& warpSize, &') + code('& sharedMemoryOffset)') + + code('') + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if ninds > 0: #indirect loop + code('TYPE ( '+name+'_opDatDimensions ) , DEVICE :: opDatDimensions') + code('TYPE ( '+name+'_opDatCardinalities ) , DEVICE :: opDatCardinalities') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pindSizesSize - 1), DEVICE :: pindSizes') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pindOffsSize - 1), DEVICE :: pindOffs') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pblkMapSize - 1), DEVICE :: pblkMap') + 
code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%poffsetSize - 1), DEVICE :: poffset') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pnelemsSize - 1), DEVICE :: pnelems') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pnthrcolSize - 1), DEVICE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(0:opDatCardinalities%pthrcolSize - 1), DEVICE :: pthrcol') + code('INTEGER(kind=4), VALUE :: blockOffset') + code('') + for g_m in range(0,ninds): + if accs[invinds[g_m]] == OP_INC: + for m in range (0,int(idxs[g_m])): + code('REAL(kind=8), DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(invinds[g_m]+1+m)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1+m)+'Map') + + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'nBytes') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'nBytes') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'nBytes') + + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'RoundUp') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'RoundUp') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'RoundUp') + + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4), SHARED :: opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize') + code('') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedFloat8') + code('INTEGER(kind=4) :: sharedOffsetFloat8') + code('INTEGER(kind=4), SHARED :: numOfColours') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4), SHARED :: sharedMemoryOffset') + code('INTEGER(kind=4), SHARED :: blockID') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreads') + code('INTEGER(kind=4) :: moduloResult') + code('INTEGER(kind=4) :: nbytes') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('INTEGER(kind=4) :: 
n1') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + + IF('threadIdx%x - 1 .EQ. 0') + code('blockID = pblkMap(blockIdx%x - 1 + blockOffset)') + code('numberOfActiveThreads = pnelems(blockID)') + code('numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)') + code('numOfColours = pnthrcol(blockID)') + code('sharedMemoryOffset = poffset(blockID)') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize = pindSizes('+str(g_m)+' + blockID * '+str(ninds)+')') + ENDIF() + code('') + code('CALL syncthreads()') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'RoundUp = opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize * opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension') + code('') + for g_m in range(0,ninds): + if g_m == 0: + code('opDat'+str(invinds[g_m]+1)+'nBytes = 0') + else: + code('opDat'+str(invinds[g_m]+1)+'nBytes = opDat'+str(invinds[g_m-1]+1)+'nBytes * 8 / 8 + opDat'+str(invinds[g_m-1]+1)+'RoundUp * 8 / 8') + code('') + + for g_m in range(0,ninds): + code('i1 = threadIdx%x - 1') + code('n1 = opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize * opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension') + if accs[invinds[g_m]] == OP_READ: + DOWHILE('i1 < n1') + code('moduloResult = mod(i1,opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension)') + code('sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes + i1) = opDat'+str(invinds[g_m]+1)+'Device'+name+'( &') + code('& moduloResult + ind_maps'+str(invinds[g_m]+1)+'_'+name+'(0 + (pindOffs('+str(g_m)+' + blockID * '+str(ninds)+') + i1 / &') + code('& opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension) + 1) * &') + code('& opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension + 1)') + code('i1 = i1 + blockDim%x') + ENDDO() + elif accs[invinds[g_m]] == OP_INC: + DOWHILE('i1 < n1') + code('sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes + i1) = 0') + code('i1 = i1 + blockDim%x') + ENDDO() + code('') 
+ + code('CALL syncthreads()') + code('i1 = threadIdx%x - 1') + code('') + + + DOWHILE('i1 < numberOfActiveThreadsCeiling') + code('colour2 = -1') + IF('i1 < numberOfActiveThreads') + for g_m in range(0,ninds): + if accs[invinds[g_m]] == OP_INC: + for m in range (0,int(idxs[g_m])): + DO('i2','0','opDatDimensions%opDat'+str(invinds[g_m]+1+m)+'Dimension') + code('opDat'+str(invinds[g_m]+1+m)+'Local(i2) = 0') + ENDDO() + + else: #direct loop + code('TYPE ( '+name+'_opDatDimensions ) , DEVICE :: opDatDimensions') + code('TYPE ( '+name+'_opDatCardinalities ) , DEVICE :: opDatCardinalities') + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + else: #global arg + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Local') + code(typs[g_m]+', DIMENSION(:), DEVICE :: reductionArrayDevice'+str(g_m+1)) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + code(typs[g_m]+', DIMENSION(:), DEVICE :: reductionArrayDevice'+str(g_m+1)) + + + code('INTEGER(kind=4), VALUE :: setSize') + code('INTEGER(kind=4), VALUE :: warpSize') + code('INTEGER(kind=4), VALUE :: sharedMemoryOffset') + code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedFloat8') + code('INTEGER(kind=4) :: sharedOffsetFloat8') + code('INTEGER(kind=4) :: numberOfActiveThreads') + code('INTEGER(kind=4) :: localOffset') + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + + +########################################################################## +# CUDA kernel call +########################################################################## + if ninds > 0: #indirect kernel call + code('') + comm('kernel call') + line = ' CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i1 + 
sharedMemoryOffset) * opDatDimensions%opDat'+str(g_m+1)+ \ + 'Dimension + 1:(i1 + sharedMemoryOffset) * opDatDimensions%opDat'+ \ + str(g_m+1)+'Dimension + opDatDimensions%opDat'+str(g_m+1)+\ + 'Dimension + 1 + 1)' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+ \ + name+'((i1 + sharedMemoryOffset) * opDatDimensions%opDat'+str(g_m+1)+ \ + 'Dimension + 1)' + if maps[g_m] == OP_MAP and accs[g_m] == OP_READ: + line = line + indent + '& sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+ \ + 'nBytes + mappingArray'+str(g_m+1)+'_'+name+ \ + '(i1 + sharedMemoryOffset + 1) * opDatDimensions%opDat'+str(g_m+1)+'Dimension)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_INC or accs[g_m] == OP_RW): + line = line +indent + '& opDat'+str(g_m+1)+'Local' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('colour2 = pthrcol(i1 + sharedMemoryOffset)') + ENDIF() + + code('') + for g_m in range(0,ninds): + if accs[invinds[g_m]] == OP_INC: + for m in range (0,int(idxs[g_m])): + code('opDat'+str(invinds[g_m]+1+m)+'Map = mappingArray'+str(invinds[g_m]+1+m)+'_'+name+'(i1 + sharedMemoryOffset + 1)') + code('') + + DO('colour1','0','numOfColours') + IF('colour2 .EQ. 
colour1') + for g_m in range(0,ninds): + if accs[invinds[g_m]] == OP_INC: + for m in range (0,int(idxs[g_m])): + DO('i2','0', 'opDatDimensions%opDat'+str(invinds[g_m]+1+m)+'Dimension') + code('sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes + (i2 + opDat'+str(invinds[g_m]+1+m)+'Map * opDatDimensions%opDat'+str(invinds[g_m]+1+m)+'Dimension)) = &') + code('& sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes + (i2 + opDat'+str(invinds[g_m]+1+m)+'Map * opDatDimensions%opDat'+str(invinds[g_m]+1+m)+'Dimension)) + opDat'+str(invinds[g_m]+1+m)+'Local(i2)') + ENDDO() + code('') + ENDIF() + code('CALL syncthreads()') + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + code('') + code('CALL syncthreads()') + code('i1 = threadIdx%x - 1') + code('') + for g_m in range(0,ninds): + if accs[invinds[g_m]] == OP_INC: + DOWHILE('i1 < opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize * opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension') + code('moduloResult = mod(i1,opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension)') + code('opDat'+str(invinds[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds[g_m]+1)+'_'+name+' &') + code('& (0 + (pindOffs(3 + blockID * 4) + i1 / opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension) + 1) * &') + code('& opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension + 1) = &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds[g_m]+1)+'_'+name+' &') + code('& (0 + (pindOffs(3 + blockID * 4) + i1 / opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension) + 1) * &') + code('& opDatDimensions%opDat'+str(invinds[g_m]+1)+'Dimension + 1) + &') + code('& sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes + i1)') + code('i1 = i1 + blockDim%x') + ENDDO() + + else: #direct kernel call + code('') + comm('kernel call') + code('threadID = mod(threadIdx%x - 1,warpSize)') + code('sharedOffsetFloat8 = sharedMemoryOffset * ((threadIdx%x - 1) / warpSize) / 8') + code('') + DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * 
blockDim%x','setSize','blockDim%x * gridDim%x') + code('localOffset = i1 - threadID') + code('numberOfActiveThreads = min(warpSize,setSize - localOffset)') + for g_m in range(0,nargs): + if int(dims[g_m]) != 1 and (accs[g_m] == OP_READ or accs[g_m] == OP_RW): + DO('i2','0','opDatDimensions%opDat'+str(g_m+1)+'Dimension') + code('sharedFloat8(sharedOffsetFloat8 + (threadID + i2 * numberOfActiveThreads)) = &') + code('& opDat'+str(g_m+1)+'Device'+name+'(threadID + (i2 * numberOfActiveThreads + localOffset &') + code('& * opDatDimensions%opDat'+str(g_m+1)+'Dimension) + 1)') + ENDDO() + code('') + DO('i2','0','opDatDimensions%opDat'+str(g_m+1)+'Dimension') + code('opDat'+str(g_m+1)+'Local(i2) = sharedFloat8(sharedOffsetFloat8 + (i2 + threadID * opDatDimensions%opDat'+str(g_m+1)+'Dimension))') + ENDDO() + code('') + code('') + line = ' CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local' + else: + if int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('') + for g_m in range(0,nargs): + if int(dims[g_m]) != 1 and (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW): + DO('i2','0','opDatDimensions%opDat'+str(g_m+1)+'Dimension') + code('sharedFloat8(sharedOffsetFloat8 + (i2 + threadID * opDatDimensions%opDat'+str(g_m+1)+'Dimension)) = opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + code('') + DO('i2','0','opDatDimensions%opDat'+str(g_m+1)+'Dimension') + code('opDat'+str(g_m+1)+'Device'+name+'(threadID + (i2 * numberOfActiveThreads + localOffset * &') + code('& opDatDimensions%opDat'+str(g_m+1)+'Dimension) + 1) = &') + code('& sharedFloat8(sharedOffsetFloat8 + (threadID + i2 * numberOfActiveThreads))') + ENDDO() + code('') + + ENDDO() + + #call 
cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code('CALL ReductionFloat8(reductionArrayDevice'+str(g_m+1)+'(blockIdx%x - 1 + 1:),opDat'+str(g_m+1)+'Local,0)') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CUP hust stub +########################################################################## + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(len='+str(len(name))+'), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: returnMPIHaloExchange') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('TYPE ( '+name+'_opDatDimensions ) , DEVICE :: opDatDimensions') + code('TYPE ( '+name+'_opDatCardinalities ) , DEVICE :: opDatCardinalities') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('TYPE ( c_devptr ) , POINTER, DIMENSION(:) :: pindMaps') + 
code('TYPE ( c_devptr ) , POINTER, DIMENSION(:) :: pmaps') + code('') + code('INTEGER(kind=4) :: pindMapsSize') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('INTEGER(kind=4) :: mappingArray'+str(g_m+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: pindSizesSize') + code('INTEGER(kind=4) :: pindOffsSize') + code('INTEGER(kind=4) :: pblkMapSize') + code('INTEGER(kind=4) :: poffsetSize') + code('INTEGER(kind=4) :: pnelemsSize') + code('INTEGER(kind=4) :: pnthrcolSize') + code('INTEGER(kind=4) :: pthrcolSize') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: pnindirect') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pindSizes') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pindOffs') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pblkMap') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: poffset') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pnelems') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, ALLOCATABLE :: pthrcol') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + 
code('INTEGER(kind=4), SAVE :: calledTimes') + code('INTEGER(kind=4) :: returnDumpOpDat') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4) :: sharedMemoryOffset') + code('INTEGER(kind=4) :: warpSize') + code('INTEGER(kind=4), SAVE :: calledTimes') + code('INTEGER(kind=4) :: returnDumpOpDat') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + + code('INTEGER(kind=4) :: istat') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)) + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + code('returnMPIHaloExchange = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + IF('returnMPIHaloExchange .EQ. 
0') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('RETURN') + ENDIF() + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + else: + code('') + code('blocksPerGrid = 200') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('warpSize = OP_WARPSIZE') + code('dynamicSharedMemorySize = 32') + code('sharedMemoryOffset = dynamicSharedMemorySize * OP_WARPSIZE') + code('dynamicSharedMemorySize = dynamicSharedMemorySize * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDatCardinalities%opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDatCardinalities%opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDatCardinalities%opDat'+str(g_m+1)+'Cardinality = set%setPtr%size') + code('') + + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code('opDatDimensions%opDat'+str(g_m+1)+'Dimension = opArg'+str(g_m+1)+'%dim') + + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + 
code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = set%setPtr%size') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') + code('') + + if ninds > 0: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_maps,pindMaps,(/numberOfIndirectOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk,(/set%setPtr%size/))') + code('') + code('pindSizesSize = actualPlan_'+name+'%nblocks * numberOfIndirectOpDats') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_sizes,pindSizes,(/pindSizesSize/))') + code('') + code('pindOffsSize = pindSizesSize') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_offs,pindOffs,(/pindOffsSize/))') + code('') + code('pblkMapSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,pblkMap,(/pblkMapSize/))') + code('') + code('poffsetSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,poffset,(/poffsetSize/))') + code('') + code('pnelemsSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,pnelems,(/pnelemsSize/))') + code('') + code('pnthrcolSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,pnthrcol,(/pnthrcolSize/))') + code('') + code('pthrcolSize = set%setPtr%size') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,pthrcol,(/pthrcolSize/))') + code('CALL 
c_f_pointer(actualPlan_'+name+'%nindirect,pnindirect,(/numberOfIndirectOpDats/))') + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(pindMaps('+str(g_m+1)+'),ind_maps'+str(invinds[g_m]+1)+'_'+name+',pnindirect('+str(g_m+1)+'))') + code('CALL c_f_pointer(actualPlan_'+name+'%maps,pmaps,(/numberOfOpDats/))') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + IF('indirectionDescriptorArray('+str(g_m+1)+') >= 0') + code('mappingArray'+str(g_m+1)+'Size = set%setPtr%size') + code('CALL c_f_pointer(pmaps('+str(g_m+1)+'),mappingArray'+str(g_m+1)+'_'+name+',(/mappingArray'+str(g_m+1)+'Size/))') + ENDIF() + code('') + + for g_m in range(0,ninds): + code('opDatCardinalities%ind_maps'+str(invinds[g_m]+1)+'Size = pnindirect('+str(g_m+1)+')') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('opDatCardinalities%mappingArray'+str(g_m+1)+'Size = mappingArray'+str(g_m+1)+'Size') + code('') + + code('opDatCardinalities%pblkMapSize = pblkMapSize') + code('opDatCardinalities%pindOffsSize = pindOffsSize') + code('opDatCardinalities%pindSizesSize = pindSizesSize') + code('opDatCardinalities%pnelemsSize = pnelemsSize') + code('opDatCardinalities%pnthrcolSize = pnthrcolSize') + code('opDatCardinalities%poffsetSize = poffsetSize') + code('opDatCardinalities%pthrcolSize = pthrcolSize') + code('') + + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+') )') + code('allocate( reductionArrayDevice'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+') )') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.00000') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+' = reductionArrayHost'+str(g_m+1)+'') + + + #indirect loop host stub call + if ninds > 0: + code('blockOffset = 0') + code('') + 
code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = ncolblk(i2 + 1)') + code('dynamicSharedMemorySize = actualPlan_'+name+'%nshared') + code('') + code('CALL op_cuda_'+name+' <<>> &') + code('& (opDatDimensions,opDatCardinalities,pindSizes,pindOffs,pblkMap, &') + code('& poffset,pnelems,pnthrcol,pthrcol,blockOffset)') + code('') + code('threadSynchRet = cudaThreadSynchronize()') + code('blockOffset = blockOffset + blocksPerGrid') + ENDDO() + code('') + else: #direct loop host stub call + code('CALL op_cuda_'+name+' <<>> &') + code('& (opDatDimensions,opDatCardinalities, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + code('reductionArrayDevice'+str(g_m+1)+', &') + code('set%setPtr%size, &') + code('& warpSize,sharedMemoryOffset)') + code('') + code('threadSynchRet = cudaDeviceSynchronize()') + + #reduction + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + code('opDat'+str(g_m+1)+'Host = reductionArrayHost'+str(g_m+1)+'(i10+1) + opDat'+str(g_m+1)+'Host') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('deallocate( reductionArrayDevice'+str(g_m+1)+' )') + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('istat = cudaDeviceSynchronize()') + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + 
def comm(line):
    """Append a comment line to the generated source text.

    An empty ``line`` yields a blank line.  In Fortran mode the comment is
    written with a leading ``!`` and no indentation; in C++ mode it is written
    as ``//<line>`` indented by ``depth`` spaces.  Nothing is emitted when
    neither mode flag is set.
    """
    global file_text
    if not line:
        file_text += '\n'
    elif FORTRAN:
        # NOTE(review): the reviewed view of this file is whitespace-collapsed;
        # confirm the number of blanks after '!' against the real source.
        file_text += '! ' + line + '\n'
    elif CPP:
        file_text += ' ' * depth + '//' + line + '\n'


def rep(line, m):
    """Expand the per-argument placeholders in ``line`` for argument ``m``.

    Placeholders are replaced in a fixed order: INDDIM, INDTYP, INDARG,
    DIMS (Fortran) / DIM (C++), ARG, TYPS (Fortran) / TYP (C++), IDX.
    Argument numbers are 1-based in Fortran mode and 0-based in C++ mode.
    ``line`` is returned unchanged when neither mode flag is set.

    INDDIM / INDTYP are only expanded when ``m`` indexes into ``inddims``;
    this guard now applies to both modes (previously the C++ branch indexed
    ``inddims[m]`` unconditionally and raised IndexError).  When the guard
    fails in C++ mode, a leftover INDDIM / INDTYP is still subject to the
    later DIM / TYP substitution.

    Replacement is plain text (``str.replace``): the placeholders are literal
    strings, and the substituted values come from kernel metadata that must
    not be interpreted as regular-expression replacement templates.
    """
    if FORTRAN:
        offset, dim_key, typ_key = 1, 'DIMS', 'TYPS'
    elif CPP:
        offset, dim_key, typ_key = 0, 'DIM', 'TYP'
    else:
        return line

    if m < len(inddims):
        line = line.replace('INDDIM', str(inddims[m]))
        line = line.replace('INDTYP', str(indtyps[m]))

    # Order matters: INDARG must be consumed before ARG, and the values are
    # looked up eagerly (as before) even if a placeholder does not occur.
    line = line.replace('INDARG', 'ind_arg' + str(m + offset))
    line = line.replace(dim_key, str(dims[m]))
    line = line.replace('ARG', 'arg' + str(m + offset))
    line = line.replace(typ_key, typs[m])
    line = line.replace('IDX', str(int(idxs[m])))
    return line


def code(text):
    """Append one line of generated code, indented by ``depth`` spaces.

    ``text`` is passed through ``rep`` using the current global argument
    index ``g_m``.  An empty ``text`` produces a bare newline (no trailing
    indentation).  Nothing is emitted when neither mode flag is set.
    """
    global file_text
    prefix = ' ' * depth if text else ''
    if FORTRAN or CPP:
        file_text += prefix + rep(text, g_m) + '\n'


def code_pre(text):
    """Append one line of generated code at column 0 (no indentation).

    Same placeholder expansion as ``code``; the name suggests it is meant
    for preprocessor lines, which is how an un-indented line would be used.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text, g_m) + '\n'


def DO(i, start, finish):
    """Open a counted loop over ``start <= i < finish`` with unit step."""
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + ' - 1, 1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def DO_STEP(i, start, finish, step):
    """Open a counted loop over ``start <= i < finish`` advancing by ``step``."""
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + ' - 1, ' + step)
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; '
             + i + ' = ' + i + ' + ' + step + ' ){')
    depth += 2


def DOWHILE(line):
    """Open a condition-controlled loop on the expression ``line``."""
    global depth
    if FORTRAN:
        code('DO WHILE (' + line + ' )')
    elif CPP:
        code('while (' + line + ' )')
    depth += 2


def FOR(i, start, finish):
    """Open a counted loop over ``start <= i < finish``.

    Differs from ``DO`` only in that the Fortran form omits the explicit
    step of 1.
    """
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + ' - 1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def ENDDO():
    """Close the innermost loop opened by DO / DO_STEP / DOWHILE."""
    global depth
    # Dedent first so the closing keyword lines up with the loop header.
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def ENDFOR():
    """Close the innermost loop opened by FOR (same output as ENDDO)."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def IF(line):
    """Open a conditional block on the expression ``line``."""
    global depth
    if FORTRAN:
        code('IF (' + line + ') THEN')
    elif CPP:
        code('if (' + line + ') {')
    depth += 2


def ELSE():
    """Emit the else-branch header at the indentation of the matching IF."""
    global depth
    depth -= 2
    if FORTRAN:
        code('ELSE')
    elif CPP:
        code('else {')
    depth += 2


def ENDIF():
    """Close the innermost conditional block."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END IF')
    elif CPP:
        code('}')
(0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + + #for unknown dimension indirect inc, swap to OP_RW + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()) and maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if 'npdes' in dims[g_m]: + dims[g_m] = dims[g_m].replace('npdes','DNPDE') + else: + accs[g_m] = OP_RW + for g_m in range(0,ninds): + if (not inddims[g_m].isdigit()) and indaccs[g_m] == OP_INC: + if 'npdes' in dims[g_m]: + inddims[g_m] = inddims[g_m].replace('npdes','DNPDE') + else: + indaccs[g_m] = OP_RW +# +# set two logicals +# + j = -1 + ind_rw = 0 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + if maps[i] == OP_MAP and accs[i] == OP_RW: + ind_rw = 1 + ind_inc = j >= 0 + + j = -1 + reduct_mdim = 0 + reduct_1dim = 0 + for i in range(0,nargs): + if maps[i] == OP_GBL and (accs[i] == OP_INC or accs[i] == OP_MAX or accs[i] == OP_MIN): + j = i + 
if (not dims[i].isdigit()) or int(dims[i])>1: + reduct_mdim = 1 + else: + reduct_1dim = 1 + if maps[i] == OP_GBL and accs[i] == OP_WRITE: + j = i + reduct = reduct_1dim or reduct_mdim + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + stage_flags=[0]*nargs; + +# for g_m in range(0,nargs): +# if dims[g_m] == 'NPDE': +# dims[g_m] = '6' + +# if ('GRADL_EDGECON' in name): +# for g_m in range(0,nargs): +# if 'NPDE' in dims[g_m]: +# dims[g_m] = dims[g_m].replace('NPDE','6') +# try: +# newdim = str(eval(dims[g_m])) +# dims[g_m] = newdim +# except NameError as inst: +# dims[g_m] +# #do nothing + + + atomic_reduction = 1 + unknown_reduction_size = 0 + unknown_size_red = [0]*nargs + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()): + found=0 + for string in ['DNPDE']: #,'DNTQMU','DNFCROW','1*1']: + if string in dims[g_m]: + found=1 + if found==0: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + unknown_reduction_size = 1 + if atomic_reduction == 1: + unknown_size_red[g_m] = 1 + else: + soaflags[g_m] = 1 + is_soa = 1 + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + for i in range(0,nargs): + if maps[i]==OP_MAP: + dims[i] = dims[invinds[inds[i]-1]] + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + permute = 0 +# if ('ACCUMEDGES' in name) or ('GRADL_EDGECON' in name): + #if ('ACCUMEDGES' in name) or ('IFLUX_EDGEF' in name): +# permute = 1 + + stage_inc = 1 +# if ('IFLUX_EDGE' in name) or ('VFLUX_EDGE' in name): +# stage_inc = 1 + + #figure out which maps to stage + ninds_staged = 0 + inds_staged = [-1]*nargs + if stage_inc: + for i in range(0,nargs): + if maps[i]==OP_MAP and accs[i]==OP_INC: + if inds_staged[invinds[inds[i]-1]] == -1: + inds_staged[i] = ninds_staged + ninds_staged = ninds_staged + 1 + else: + inds_staged[i] = inds_staged[invinds[inds[i]-1]] + invinds_staged = 
[-1]*ninds_staged + inddims_staged = [-1]*ninds_staged + indopts_staged = [-1]*ninds_staged + if stage_inc: + for i in range(0,nargs): + if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: + invinds_staged[inds_staged[i]] = i + inddims_staged[inds_staged[i]] = dims[i] + if optflags[i] == 1: + indopts_staged[inds_staged[i]] = i + for i in range(0,nargs): + inds_staged[i] = inds_staged[i] + 1 + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][9:]+'_module_MODULE') + modfile = kernels[nk]['mod_file'][9:]+'_module' + filename = 'kernels/'+kernels[nk]['master_file']+'_'+name+'.inc' + if not os.path.isfile(filename): + files = [f for f in glob.glob('kernels/*'+name+'.inc')] + if len(files)>0: + filename = files[0] + else: + print('kernel for '+name+' not found') + fid = open(filename, 'r') + text = fid.read() + fid.close() + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + code('USE CUDAFOR') + code('USE CUDACONFIGURATIONPARAMS') + code('') + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + #strides for SoA + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), CONSTANT :: 
opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT') + code('INTEGER(kind=4) :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('INTEGER(kind=4), CONSTANT :: direct_stride_OP2CONSTANT') + code('INTEGER(kind=4) :: direct_stride_OP2HOST') + dir_soa = g_m + break + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if (accs[g_m]== OP_INC or accs[g_m]== OP_MIN or accs[g_m]== OP_MAX): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: reductionArrayDevice'+str(g_m+1)+name) + if ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + code(typs[g_m]+', DIMENSION(:), DEVICE, ALLOCATABLE :: opGblDat'+str(g_m+1)+'Device'+name) + + + code('') + + if ninds > 0: + code('TYPE ( c_ptr ) :: planRet_'+name) + code('') + if is_soa > -1: + code('#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)') + code('') + code('CONTAINS') + code('') + +########################################################################## +# Reduction kernel function - if an OP_GBL exists +########################################################################## + if reduct_1dim or unknown_reduction_size: + comm('Reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8(sharedDouble8, reductionResult,inputValue,reductionOperation)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('REAL(kind=8), DIMENSION(0:*) :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') 
+ code('CASE (0)') + code('sharedDouble8(threadID) = sharedDouble8(threadID) + sharedDouble8(threadID + i1)') + code('CASE (1)') + IF('sharedDouble8(threadID + i1) < sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(threadID + i1) > sharedDouble8(threadID)') + code('sharedDouble8(threadID) = sharedDouble8(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedDouble8(0)') + code('CASE (1)') + IF('sharedDouble8(0) < reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('CASE (2)') + IF('sharedDouble8(0) > reductionResult(1)') + code('reductionResult(1) = sharedDouble8(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + code('attributes (device) SUBROUTINE ReductionInt4(sharedInt4, reductionResult,inputValue,reductionOperation)') + code('INTEGER(kind=4), DIMENSION(:), DEVICE :: reductionResult') + code('INTEGER(kind=4) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), DIMENSION(0:*) :: sharedInt4') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedInt4(threadID) = inputValue') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('sharedInt4(threadID) = sharedInt4(threadID) + sharedInt4(threadID + i1)') + code('CASE (1)') + IF('sharedInt4(threadID + i1) < sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(threadID + i1) > 
sharedInt4(threadID)') + code('sharedInt4(threadID) = sharedInt4(threadID + i1)') + ENDIF() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1) = reductionResult(1) + sharedInt4(0)') + code('CASE (1)') + IF('sharedInt4(0) < reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('CASE (2)') + IF('sharedInt4(0) > reductionResult(1)') + code('reductionResult(1) = sharedInt4(0)') + ENDIF() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + if reduct_mdim: + comm('Multidimensional reduction cuda kernel'); depth = depth +2; + code('attributes (device) SUBROUTINE ReductionFloat8Mdim(sharedDouble8, reductionResult,inputValue,reductionOperation,dim)') + code('REAL(kind=8), DIMENSION(:), DEVICE :: reductionResult') + code('REAL(kind=8), DIMENSION(:) :: inputValue') + code('INTEGER(kind=4), VALUE :: reductionOperation') + code('INTEGER(kind=4), VALUE :: dim') + code('REAL(kind=8), DIMENSION(0:*) :: sharedDouble8') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: d') + code('INTEGER(kind=4) :: threadID') + code('threadID = threadIdx%x - 1') + code('i1 = ishft(blockDim%x,-1)') + code('CALL syncthreads()') + code('sharedDouble8(threadID*dim:threadID*dim+dim-1) = inputValue(1:dim)') + + DOWHILE('i1 > 0') + code('CALL syncthreads()') + IF('threadID < i1') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = sharedDouble8(threadID*dim + i2) + sharedDouble8((threadID + i1)*dim + i2)') + ENDDO() + code('CASE (1)') + DO('i2','0','dim') +# IF('sharedDouble8(threadID*dim + i2).GT.sharedDouble8((threadID + i1)*dim + i2)') + code('sharedDouble8(threadID*dim + i2) = MIN(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + #ENDIF() + ENDDO() + 
code('CASE (2)') + DO('i2','0','dim') + code('sharedDouble8(threadID*dim + i2) = MAX(sharedDouble8(threadID*dim + i2), sharedDouble8((threadID + i1)*dim + i2))') + ENDDO() + code('END SELECT') + ENDIF() + code('i1 = ishft(i1,-1)') + ENDDO() + + code('CALL syncthreads()') + + IF('threadID .EQ. 0') + code('SELECT CASE(reductionOperation)') + code('CASE (0)') + code('reductionResult(1:dim) = reductionResult(1:dim) + sharedDouble8(0:dim-1)') + code('CASE (1)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MIN(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('CASE (2)') + DO('i2','0','dim') + code('reductionResult(1+i2) = MAX(reductionResult(1+i2) , sharedDouble8(i2))') + ENDDO() + code('END SELECT') + ENDIF() + + code('CALL syncthreads()') + code('END SUBROUTINE') + code('') + + + +########################################################################## +# Inline user kernel function +########################################################################## + using_consts = 0 + if hydra: + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + #text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','subroutine') + if hybrid == 1: + text = text.replace('subroutine '+name, 'attributes(host) subroutine '+name) + file_text += text + code('') + code('') + #remove all comments + util.const_list = [] + text = re.sub('!.*\n','\n',text) + text = replace_consts(text) + using_consts = text.find('use HYDRA_CONST_MODULE')>=0 + text = text.replace('subroutine '+name, 'attributes(device) subroutine '+name+'_gpu',1) + + #find subroutine calls + util.funlist = [name.lower()] + util.funlist2 = [] +# print name + plus_kernels, text = find_function_calls(text,'attributes(device) ',name+'_gpu') + funcs = util.replace_soa_subroutines(util.funlist2,0,soaflags,maps,accs,mapnames,0,hydra,bookleaf) +# if name == 'SET_QB_BND': +# 
print name +# print '\n\n\n' +# pp = pprint.PrettyPrinter(indent=4) +# pp.pprint(funcs) + text = '' + for func in funcs: + text = text + '\n' + func['function_text'] + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) + + #strip "use" statements + i = re.search('\\buse\\b',text.lower()) + i_offset = 0 + while not (i is None): + i_offset = i_offset+i.start() + if not ('HYDRA_CONST_MODULE' in text[i_offset:i_offset+23]): + text = text[0:i_offset]+'!'+text[i_offset:] + i_offset = i_offset+4 + i = re.search('\\buse\\b',text[i_offset:].lower()) + + file_text += text + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += 'attributes (host) subroutine ' + name + '' + text[i+ 11 + len(name):j]+'\n\n' + kern_text = 'attributes (device) subroutine ' + name + '_gpu' + text[i+ 11 + len(name):j]+'_gpu\n\n' + for const in range(0,len(consts)): + i = re.search('\\b'+consts[const]['name']+'\\b',kern_text) + if i != None: +# print 'Found ' + consts[const]['name'] + j = i.start() + kern_text = kern_text[0:j+1] + re.sub('\\b'+consts[const]['name']+'\\b',consts[const]['name']+'_OP2',kern_text[j+1:]) + + text = replace_soa(kern_text,nargs,soaflags,name,maps,accs,set_name,mapnames,0,hydra,bookleaf) + if any_soa: + text = re.sub('\\bDIMENSION\([A-Za-z0-9_]*\)','DIMENSION(*)',text) + file_text += text + + else: + depth -= 2 + code('attributes (host) &') + code('#include "'+name+'.inc"') + code('attributes (device) &') + fid = open(name+'.inc2', 'r') + text = fid.read() + 
text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,0,hydra,bookleaf) + code(text) + depth += 2 + code('') + + code('') + +########################################################################## +# Generate CUDA kernel function +########################################################################## + comm('CUDA kernel function') + code('attributes (global) SUBROUTINE op_cuda_'+name+'( &'); depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + elif accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opGblDat'+str(g_m+1)+'Device'+name+', &') + + if ninds > 0: #indirect loop + if stage_inc: + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + code('& ind_sizes, &') + code('& ind_offs, &') + + code('& pblkMap, &') + code('& poffset, &') + code('& pnelems, &') + code('& pnthrcol, &') + code('& pthrcol, &') + if permute: + code('& pcol_reord, &') + code('& setSize, &') + code('& blockOffset)') + else: #direct loop + code('& setSize)') + + code('') + if hydra and using_consts: + code('use HYDRA_CONST_MODULE') + 
code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(invinds[g_m]+1)+'Dim') + if indaccs[g_m]==OP_READ: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + else: + code(typs[invinds[g_m]]+', DEVICE :: opDat'+str(invinds[g_m]+1)+'Device'+name+'(*)') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4), DEVICE, INTENT(IN) :: opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_READ: + code(typs[g_m]+', DEVICE, INTENT(IN) :: opDat'+str(g_m+1)+'Device'+name+'(*)') + else: + code(typs[g_m]+', DEVICE :: opDat'+str(g_m+1)+'Device'+name+'(*)') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('INTEGER(kind=4), VALUE :: opDat'+str(g_m+1)+'Dim') + if accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + #and additionally we need registers to store contributions, depending on dim: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opGblDat'+str(g_m+1)+'Device'+name) + else: + if g_m in needDimList: + code(typs[g_m]+', DEVICE :: scratchDevice'+str(g_m+1)+'(*)') + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opGblDat'+str(g_m+1)+'Device'+name) + else: + #if it's not a global reduction, and multidimensional then we pass in a device array + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + if accs[g_m] == OP_READ: #if OP_READ and dim 1, we can pass 
in by value + code(typs[g_m]+', VALUE :: opGblDat'+str(g_m+1)+'Device'+name) + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + + if ninds > 0: #indirect loop + if stage_inc: + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_maps'+str(invinds_staged[g_m]+1)) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), DIMENSION(0:*), DEVICE :: mappingArray'+str(g_m+1)) + code('') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_sizes') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: ind_offs') + + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pblkMap') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: poffset') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnelems') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE, INTENT(IN) :: pthrcol') + if permute: + code('INTEGER(kind=4), DIMENSION(0:*), DEVICE :: pcol_reord') + code('INTEGER(kind=4), VALUE :: blockOffset') + code('INTEGER(kind=4), VALUE :: setSize') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if inds_staged[g_m] > 0: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'SharedMap') + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Local') + else: + if g_m in needDimList: + print('Error, cannot statically determine dim of argument '+str(g_m+1)+' in kernel '+name) + sys.exit(-1) + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + + code('') + + if stage_inc: + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + 
code('REAL(kind=8), DIMENSION(0:*), SHARED :: sharedFloat8') + if add_int: + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: sharedInt8') + code('') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), SHARED :: ind_maps'+str(invinds_staged[g_m]+1)+'offset') + code('INTEGER(kind=4), SHARED :: ind_maps'+str(invinds_staged[g_m]+1)+'size') + + + code('INTEGER(kind=4), SHARED :: numOfColours') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4), SHARED :: blockID') + code('INTEGER(kind=4), SHARED :: threadBlockOffset') + code('INTEGER(kind=4), SHARED :: numberOfActiveThreads') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('INTEGER(kind=4) :: n1') + code('INTEGER(kind=4) :: i3') + + if stage_inc: + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4) :: opDat'+str(invinds_staged[g_m]+1)+'nBytes') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), SHARED :: opDat'+str(invinds_staged[g_m]+1)+'RoundUp') + if ninds_staged > 0: + code('INTEGER(kind=4) moduloResult') + + else: #direct loop + code('INTEGER(kind=4), VALUE :: setSize') + + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + if unknown_reduction_size: + code('INTEGER(kind=4) :: thrIdx') + + if reduct: + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + code('REAL(kind=8), DIMENSION(0:*), SHARED :: redFloat8') + if add_int: + code('INTEGER(kind=4), DIMENSION(0:*), SHARED :: redInt4') + + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if g_m in needDimList: + print('Error, cannot statically determine dim of argument '+str(g_m+1)+' in kernel '+name) + sys.exit(-1) + code(typs[g_m]+', DIMENSION('+dims[g_m]+') :: opDat'+str(g_m+1)+'Staged') + + code('') + if unknown_reduction_size: + code('thrIdx = threadIdx%x - 1 + 
(blockIdx%x - 1) * blockDim%x') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = 0') + ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = 0') + elif accs[g_m] == OP_MIN or accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1)') + else: + if g_m in needDimList: + DO('i1','0',dims[g_m]) + code('scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)) = reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1)') + ENDDO() + else: + code('opGblDat'+str(g_m+1)+'Device'+name+' = reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1:(blockIdx%x - 1)*('+dims[g_m]+') + ('+dims[g_m]+'))') + + + code('') + if ninds > 0: + IF('threadIdx%x - 1 .EQ. 0') + code('blockID = pblkMap(blockIdx%x - 1 + blockOffset)') + code('numberOfActiveThreads = pnelems(blockID)') + code('numberOfActiveThreadsCeiling = blockDim%x * (1 + (numberOfActiveThreads - 1) / blockDim%x)') + code('numOfColours = pnthrcol(blockID)') + code('threadBlockOffset = poffset(blockID)') + if stage_inc: + for g_m in range(0,ninds_staged): + code('ind_maps'+str(invinds_staged[g_m]+1)+'offset = ind_offs ('+str(g_m)+' + blockID * '+str(ninds_staged)+')') + code('ind_maps'+str(invinds_staged[g_m]+1)+'size = ind_sizes('+str(g_m)+' + blockID * '+str(ninds_staged)+')') + for g_m in range(0,ninds_staged): + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = ind_maps'+str(invinds_staged[g_m]+1)+'size * ('+inddims_staged[g_m]+')') + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = opDat'+str(invinds_staged[g_m]+1)+'RoundUp + MOD(opDat'+str(invinds_staged[g_m]+1)+'RoundUp,2)') + code('') + ENDIF() + + code('') + code('CALL syncthreads()') + code('') + + if stage_inc: + for g_m in 
range(0,ninds_staged): + if g_m>0 and indopts_staged[g_m-1] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m-1]])+')') + if g_m == 0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + prev_size = 0 + if 'real' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 8 + elif 'integer' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 4 + this_size = 0 + if 'real' in typs[invinds_staged[g_m]].lower(): + this_size = 8 + elif 'integer' in typs[invinds_staged[g_m]].lower(): + this_size = 4 + if this_size == 0 or prev_size == 0: + print("ERROR: Unrecognized type") + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)+' + opDat'+str(invinds_staged[g_m-1]+1)+'RoundUp * '+str(prev_size)+' / '+str(this_size)) + if g_m>0 and indopts_staged[g_m-1] > 0: + ELSE() + if g_m==0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)) + ENDIF() + + code('') + for g_m in range(0,ninds_staged): + code('') + code('i1 = threadIdx%x - 1') + if indopts_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m]])+')') + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size') + DO('i2','0', inddims_staged[g_m]) + if accs[invinds_staged[g_m]] == OP_READ or accs[invinds_staged[g_m]] == OP_RW or accs[invinds_staged[g_m]] == OP_WRITE: + if soaflags[invinds_staged[g_m]] == 1: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * soa_stride + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1))') + else: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 + 
ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1) * ('+inddims_staged[g_m]+'))') + elif accs[invinds_staged[g_m]] == OP_INC: + code('sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + ')) = 0') + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + if indopts_staged[g_m] > 0: + ENDIF() + code('') + code('') + code('CALL syncthreads()') + + code('') + code('i1 = threadIdx%x - 1') + code('') + + + DOWHILE('i1 < numberOfActiveThreadsCeiling') + if ind_inc or ind_rw: + code('colour2 = -1') + #-----Begin Indirect RW handling----- + if ind_rw: + DO('colour1','0','numOfColours') + IF('i1 < numberOfActiveThreads') + if permute: + code('i3 = pcol_reord(i1+threadBlockOffset)') + else: + code('i3 = i1') + k = [] + for g_m in range(0,nargs): + #workaround for optional arguments: its map may not exist either + if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): + k = k + [(optflags[g_m]*nargs+mapinds[g_m])] + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + ENDIF() + else: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + code('colour2 = pthrcol(i1 + threadBlockOffset)') + IF('colour2 .EQ. 
colour1') + #-----End Indirect RW handling----- + else: + IF('i1 < numberOfActiveThreads') + if permute: + code('i3 = pcol_reord(i1+threadBlockOffset)') + else: + code('i3 = i1') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): + k = k + [(optflags[g_m]*nargs+mapinds[g_m])] + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + ENDIF() + else: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i3 + threadBlockOffset + setSize * '+str(int(idxs[g_m])-1)+')') + + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Local = 0') + else: + DO('i2','0',dims[g_m]) + code('opDat'+str(g_m+1)+'Local(i2) = 0') + ENDDO() + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i2+1) = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + code(' & (1 + i2 * '+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i2+1) = opDat'+str(g_m+1)+'Device'+name+ ' &') + code(' & (1 + i2 * direct_stride_OP2CONSTANT + i3 + threadBlockOffset)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + + + code('') + comm('kernel call') + + else: + DO_STEP('i1','threadIdx%x - 1 + (blockIdx%x - 1) * blockDim%x','setSize','blockDim%x * gridDim%x') + code('') + comm('kernel call') + code('') + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + DO('i2','0', dims[g_m]) + 
code('opDat'+str(g_m+1)+'Staged(i2+1) = opDat'+str(g_m+1)+'Device'+name+ ' &') + code(' & (1 + i2 * direct_stride_OP2CONSTANT + i1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + +########################################################################## +# CUDA kernel call +########################################################################## + if ninds > 0: #indirect kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif soaflags[g_m] == 1 and maps[g_m] != OP_GBL and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC):# and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + (i3 + threadBlockOffset))' + elif maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i3 + threadBlockOffset) * ('+dims[g_m]+') +1' + \ + ':(i3 + threadBlockOffset) * ('+dims[g_m]+') + ('+dims[g_m]+'))' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Device'+name+ \ + '((i3 + threadBlockOffset) * ('+dims[g_m]+') +1)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE):# and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+'):'+ \ + ' map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx)' + # elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and 
optflags[g_m]==1: + # line = line +indent + '& opDat'+str(g_m+1)+'Opt' + elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('') + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ '(1 + i2 * '+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Device'+name+ '(1 + i2 * direct_stride_OP2CONSTANT + i3 + threadBlockOffset) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + #write optional/SoA arguments back from registers + # for g_m in range(0,nargs): + # if (accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + # IF('BTEST(optflags,'+str(optidxs[g_m])+')') + # if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + # DO('i2','0', dims[g_m]) + # code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ ' &') + # code(' & (1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Opt(i2)') + # 
ENDDO() + # else: + # code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + # '(1 + map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(g_m+1)+'Opt') + # ENDIF() + + if ind_inc and not ind_rw: + code('colour2 = pthrcol(i1 + threadBlockOffset)') + if not ind_rw: + ENDIF() + + if ind_inc or ind_rw: + if ind_inc and not ind_rw: + if stage_inc: + code('') + if ninds_staged > 0: + IF('colour2 .GE. 0') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0 and accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'SharedMap = mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset)') + if ninds_staged > 0: + ENDIF() + code('') + + DO('colour1','0','numOfColours') + IF('colour2 .EQ. colour1') + for g_m in range(0,nargs): + if optflags[g_m]==1 and maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if (not stage_inc) or inds_staged[g_m] == 0: + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local') + else: + if soaflags[g_m] == 1 and maps[g_m] != OP_GBL: + if dims[g_m].isdigit(): # and ('IFLUX_EDGE' in name or 'VFLUX_INCREMENT' in name): + for i in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local('+str(i)+') = opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + '+str(i)+'*'+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) + opDat'+str(g_m+1)+'Local('+str(i)+')') + for i in range(0,int(dims[g_m])): + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + '+str(i)+'*'+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(g_m+1)+'Local('+str(i)+')') + else: + DO('i2','0', dims[g_m]) + 
code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*'+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2*'+get_stride_string_mapnames(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + else: + DO('i2','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx* ('+dims[g_m]+')) = &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Device'+name+ \ + '(1 + i2 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + else: + if dims[g_m].isdigit(): + if int(dims[g_m])==1: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local'+ \ + '+ sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + ( opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+')))') + code('sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + ( opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) = opDat'+str(g_m+1)+'Local') + else: + for i in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local('+str(i)+') = opDat'+str(g_m+1)+'Local('+str(i)+')'+ \ + '+ sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + ('+str(i)+' + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+')))') + for i in range(0,int(dims[g_m])): + code('sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + ('+str(i)+' + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) = opDat'+str(g_m+1)+'Local('+str(i)+')') + else: + DO('i2','0', dims[g_m]) + code('sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + (i2 + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) = &') + code('& sharedFloat8(opDat'+str(invinds[inds[g_m]-1]+1)+'nBytes + (i2 + opDat'+str(g_m+1)+'SharedMap * ('+dims[g_m]+'))) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + if optflags[g_m]!=1: + code('') + if optflags[g_m]==1 and maps[g_m]==OP_MAP and (accs[g_m] == OP_INC): + ENDIF() + 
code('') + ENDIF() + if ind_rw: + ENDIF() + if not stage_inc: + IF('colour1 .NE. numOfColours-1') + code('CALL syncthreads()') + if not stage_inc: + ENDIF() + ENDDO() + code('i1 = i1 + blockDim%x') + ENDDO() + code('') + + if stage_inc: + for g_m in range(0,ninds_staged): + if accs[invinds_staged[g_m]] == OP_INC: + if indopts_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m-1]])+')') + code('i1 = threadIdx%x - 1') + if soaflags[invinds_staged[g_m]] == 1: + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size') + if inddims_staged[g_m].isdigit(): + if int(inddims_staged[g_m])==1: + code('opDat'+str(invinds_staged[g_m]+1)+'Local = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + '+str(i)+' * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) + &') + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + '+str(i)+' * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) = opDat'+str(invinds_staged[g_m]+1)+'Local') + else: + for i in range(0,int(inddims_staged[g_m])): + code('opDat'+str(invinds_staged[g_m]+1)+'Local('+str(i)+') = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + '+str(i)+' * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) + &') + code('& sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + '+str(i)+' + i1 * ('+inddims_staged[g_m]+\ + '))') + for i in range(0,int(inddims_staged[g_m])): + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + '+str(i)+' * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + 
i1)) = opDat'+str(invinds_staged[g_m]+1)+'Local('+str(i)+')') + else: + DO('i2','0', inddims_staged[g_m]) + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(1 + i2 * '+get_stride_string_mapnames(invinds_staged[g_m],maps,mapnames,set_name)+' + ind_maps'+str(invinds_staged[g_m]+1)+\ + '(ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1)) + &') + code('& sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i2 + i1 * ('+inddims_staged[g_m]+\ + '))') + ENDDO() + else: + DOWHILE('i1 < ind_maps'+str(invinds_staged[g_m]+1)+'size * ('+inddims_staged[g_m]+')') + code('moduloResult = mod(i1,'+inddims_staged[g_m]+')') + code('opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds_staged[g_m]+1)+' &') + code('& (ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1 / ('+inddims_staged[g_m]+')) * ('+inddims_staged[g_m]+') + 1) = &') + code('& opDat'+str(invinds_staged[g_m]+1)+'Device'+name+'(moduloResult + ind_maps'+str(invinds_staged[g_m]+1)+' &') + code('& (ind_maps'+str(invinds_staged[g_m]+1)+'offset + i1 / ('+inddims_staged[g_m]+')) * ('+inddims_staged[g_m]+') + 1) + &') + code('& sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes + i1)') + code('i1 = i1 + blockDim%x') + ENDDO() + if indopts_staged[g_m] > 0: + ENDIF() + + else: #direct kernel call + line = ' CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif soaflags[g_m] == 1 and maps[g_m] != OP_GBL and (maps[g_m] != OP_MAP or accs[g_m] != OP_INC):# and optflags[g_m]==0: +# line = line +indent + '& opDat'+str(g_m+1)+'SoALocal' + line = line +indent + '& opDat'+str(g_m+1)+'Device'+name+ '(1 + i1)' + elif maps[g_m] == OP_GBL: + 
if accs[g_m] == OP_WRITE and dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name+'(1)' + else: + if (accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_INC) and g_m in needDimList: + line = line + indent +'& scratchDevice'+str(g_m+1)+'(thrIdx+1:)' + else: + line = line + indent +'& opGblDat'+str(g_m+1)+'Device'+name + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 + 1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Device'+name+'(i1 * ('+dims[g_m]+') + 1: i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + DO('i2','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Device'+name+ '(1 + i2 * direct_stride_OP2CONSTANT + i1) = &') + code(' & opDat'+str(g_m+1)+'Staged(i2+1) ') + ENDDO() + if optflags[g_m]==1: + ENDIF() + ENDDO() + + #call cuda reduction for each OP_GBL + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if accs[g_m] == OP_INC: + op = '0' + elif accs[g_m] == OP_MIN: + op = '1' + elif accs[g_m] == OP_MAX: + op = '2' + if 'real' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionFloat8(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionFloat8(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + code('i2 
= MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionFloat8Mdim(redFloat8, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + elif 'integer' in typs[g_m].lower(): + if dims[g_m].isdigit() and int(dims[g_m])==1: + code('CALL ReductionInt4(redInt4, reductionArrayDevice'+str(g_m+1)+name+'(blockIdx%x - 1 + 1:),opGblDat'+str(g_m+1)+'Device'+name+','+op+')') + else: + if g_m in needDimList: + code('do i1=0,'+dims[g_m]+'-1,1') + code(' CALL ReductionInt4(redInt4, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),scratchDevice'+str(g_m+1)+'(thrIdx+1+i1*(blockDim%x*gridDim%x)),'+op+')') + else: + code('do i1=0,'+dims[g_m]+'-1,8') + code('i2 = MIN(i1+8,'+dims[g_m]+')') + code(' CALL ReductionInt4Mdim(redInt4, reductionArrayDevice'+str(g_m+1)+name+'((blockIdx%x - 1)*('+dims[g_m]+') + 1+i1:),opGblDat'+str(g_m+1)+'Device'+name+'(i1:),'+op+',i2-i1)') + code('end do') + code('') + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate CPU hust stub +########################################################################## + + code('attributes (host) SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + IF('getHybridGPU().EQ.1') + code('CALL '+name+'_host_gpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + if hybrid == 1: + ELSE() + code('CALL 
'+name+'_host_cpu( userSubroutine, set, &'); + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + ENDIF() + depth = depth - 2 + code('END SUBROUTINE') + code('') + code('') + comm('Stub for GPU execution') + code('') + code('attributes (host) SUBROUTINE '+name+'_host_gpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + if util.const_list: + code('use HYDRA_CONST_MODULE') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('TYPE ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('TYPE ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + code('TYPE ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('') + code('') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(invinds[g_m]+1)+'Device'+name) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: opMap'+str(invinds[g_m]+1)+'Device'+name) + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: opDat'+str(g_m+1)+'Device'+name) + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Cardinality') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + elif maps[g_m] == OP_GBL: + code('INTEGER(kind=4) :: 
opDat'+str(g_m+1)+'Cardinality') + code('') + + if ninds > 0: #indirect loop + code('TYPE ( op_plan ) , POINTER :: actualPlan_'+name+'') + code('') + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: mappingArray'+str(invinds[g_m]+1)+'Size') + code('') + + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + if stage_inc: + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), DEVICE, POINTER, DIMENSION(:) :: ind_maps'+str(invinds_staged[g_m]+1)) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), DEVICE, POINTER, DIMENSION(:) :: mappingArray'+str(g_m+1)) + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: ind_offs') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: ind_sizes') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: pnindirect') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nsharedCol') + code('TYPE ( c_ptr ), POINTER, DIMENSION(:) :: mappingArray') + code('TYPE ( c_ptr ), POINTER, DIMENSION(:) :: ind_maps') + + code('INTEGER(kind=4) :: pblkMapSize') + code('INTEGER(kind=4) :: poffsetSize') + code('INTEGER(kind=4) :: pnelemsSize') + code('INTEGER(kind=4) :: pnthrcolSize') + code('INTEGER(kind=4) :: pthrcolSize') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pblkMap') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: poffset') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, 
POINTER :: pnelems') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pnthrcol') + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pthrcol') + if permute: + code('INTEGER(kind=4), DIMENSION(:), DEVICE, POINTER :: pcol_reord') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('') + + else: #direct loop + code('INTEGER(kind=4) :: blocksPerGrid') + code('INTEGER(kind=4) :: threadsPerBlock') + code('INTEGER(kind=4) :: dynamicSharedMemorySize') + code('INTEGER(kind=4) :: threadSynchRet') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i20') + code('REAL(kind=4) :: dataTransfer') + code('') + + code('INTEGER(kind=4), SAVE :: calledTimes=0') + code('INTEGER(kind=4) :: istat') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code(typs[g_m]+', DIMENSION(:), POINTER :: opDat'+str(g_m+1)+'Host') + else: + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Host') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code(typs[g_m]+' :: opDat'+str(g_m+1)+'Host_tmp') #XLF workaround + if (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + if g_m in needDimList: + code(typs[g_m]+', DIMENSION(:), DEVICE, POINTER :: scratchDevice'+str(g_m+1)) + code('INTEGER(kind=4) :: scratchDevice'+str(g_m+1)+'Size') + code('INTEGER(kind=4) :: reductionCardinality'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store 
flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + #managing constants + if any_soa: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(calledTimes.EQ.0).OR.(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(g_m+1)+'))') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT = opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2HOST') + ENDIF() + if dir_soa!=-1: + IF('(calledTimes.EQ.0).OR.(direct_stride_OP2HOST.NE.getSetSizeFromOpArg(opArg'+str(dir_soa+1)+'))') + code('direct_stride_OP2HOST = getSetSizeFromOpArg(opArg'+str(dir_soa+1)+')') + code('direct_stride_OP2CONSTANT = direct_stride_OP2HOST') + ENDIF() + + #TODO: this is terrible + # for const in util.const_list: + # code(const+'_OP2CONSTANT = '+const) + + code('call op_timers_core(startTime)') + code('') + code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize = getPartitionSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + #code('partitionSize = OP_PART_SIZE_ENV') + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + if permute: + code('& 
indirectionDescriptorArray,3)') + elif stage_inc: + code('& indirectionDescriptorArray,1)') + else: + code('& indirectionDescriptorArray,2)') + code('') + else: + code('') + if unknown_reduction_size: + code('blocksPerGrid = 100') + else: + code('blocksPerGrid = 600') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opMap'+str(invinds[g_m]+1)+'Cardinality = set%setPtr%size * getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + code('') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data_d,opDat'+str(invinds[g_m]+1)+'Device'+name+',(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data_d,opMap'+str(invinds[g_m]+1)+'Device'+name+',(/opMap'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data_d,opDat'+str(g_m+1)+'Device'+name+',(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if accs[g_m] == OP_WRITE or (not dims[g_m].isdigit()) or int(dims[g_m])>1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host,(/opDat'+str(g_m+1)+'Cardinality/))') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Host)') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('opDat'+str(g_m+1)+'Host_tmp = opDat'+str(g_m+1)+'Host') #XLF 
workaround + code('') + + if ninds > 0: + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk,(/set%setPtr%size/))') + code('pblkMapSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap_d,pblkMap,(/pblkMapSize/))') + code('poffsetSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%offset_d,poffset,(/poffsetSize/))') + code('pnelemsSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems_d,pnelems,(/pnelemsSize/))') + code('pnthrcolSize = actualPlan_'+name+'%nblocks') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,pnthrcol,(/pnthrcolSize/))') + code('pthrcolSize = set%setPtr%size') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,pthrcol,(/pthrcolSize/))') + if permute: + code('CALL c_f_pointer(actualPlan_'+name+'%col_reord,pcol_reord,(/pthrcolSize/))') + if stage_inc: + code('CALL c_f_pointer(actualPlan_'+name+'%nsharedCol,nsharedCol,(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nindirect,pnindirect,(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_maps,ind_maps,(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%maps,mappingArray,(/numberOfOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_sizes,ind_sizes,(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_offs,ind_offs,(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('') + for g_m in range(0,ninds_staged): + code('CALL c_f_pointer(ind_maps('+str(g_m+1)+'),ind_maps'+str(invinds_staged[g_m]+1)+',(/pnindirect('+str(g_m+1)+')/))') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('CALL c_f_pointer(mappingArray('+str(g_m+1)+'),mappingArray'+str(g_m+1)+',(/set%setPtr%size/))') + code('') + + code('') + + for g_m in 
range(0,nargs): + if maps[g_m] == OP_GBL and ((accs[g_m]==OP_READ and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1)) or accs[g_m]==OP_WRITE): + IF('.not. allocated(opGblDat'+str(g_m+1)+'Device'+name+')') + code('allocate(opGblDat'+str(g_m+1)+'Device'+name+'(opArg'+str(g_m+1)+'%dim))') + ENDIF() + code('opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim) = opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim)') + if ninds>0 and reduct: + code('blocksPerGrid=0') + DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = blocksPerGrid+ncolblk(i2+1)') + ENDDO() + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + #setup for reduction + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code('reductionCardinality'+str(g_m+1)+' = blocksPerGrid * 1') + code('allocate( reductionArrayHost'+str(g_m+1)+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + IF ('.not. allocated(reductionArrayDevice'+str(g_m+1)+name+')') + code('allocate( reductionArrayDevice'+str(g_m+1)+name+'(reductionCardinality'+str(g_m+1)+'* ('+dims[g_m]+')) )') + ENDIF() + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = 0.0') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = 0.0') + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('reductionArrayHost'+str(g_m+1)+'(i10+1) = opDat'+str(g_m+1)+'Host') + else: + code('reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')) = opDat'+str(g_m+1)+'Host') + ENDDO() + code('') + code('reductionArrayDevice'+str(g_m+1)+name+' = reductionArrayHost'+str(g_m+1)+'') + + code('') + if unknown_reduction_size: + if ninds>0: + code('blocksPerGrid = 0') + 
DO('i2','0','actualPlan_'+name+'%ncolors') + code('blocksPerGrid = MAX(blocksPerGrid,ncolblk(i2+1))') + ENDDO() + code('call prepareScratch(opArgArray,numberOfOpDats,blocksPerGrid*threadsPerBlock)') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN) and (g_m in needDimList): + code('scratchDevice'+str(g_m+1)+'Size = opArg'+str(g_m+1)+'%dim*blocksPerGrid*threadsPerBlock') + code('call c_f_pointer(opArgArray('+str(g_m+1)+')%data_d,scratchDevice'+str(g_m+1)+',(/scratchDevice'+str(g_m+1)+'Size/))') + + + #indirect loop host stub call + if ninds > 0: + code('blockOffset = 0') + code('') + code('threadsPerBlock = getBlockSize(userSubroutine//C_NULL_CHAR,set%setPtr%size)') + #code('threadsPerBlock = OP_PART_SIZE_ENV') + + DO('i2','0','actualPlan_'+name+'%ncolors') + IF('i2 .EQ. actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('blocksPerGrid = ncolblk(i2 + 1)') + if stage_inc: + code('dynamicSharedMemorySize = MAX(reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock, nsharedCol(1+i2))') + else: + code('dynamicSharedMemorySize = reductionSize(opArgArray,numberOfOpDats) * threadsPerBlock') + code('') + code('CALL op_cuda_'+name+' <<>> (&') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Device'+name+', &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opMap'+str(invinds[inds[g_m]-1]+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') 
+ if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host_tmp, &') #XLF workaround + + if stage_inc: + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + + code('& ind_sizes, &') + code('& ind_offs, &') + code('& pblkMap, &') + code('& poffset,pnelems,pnthrcol,pthrcol, &') + if permute: + code('& pcol_reord,set%setPtr%size+set%setPtr%exec_size, blockOffset)') + else: + code('& set%setPtr%size+set%setPtr%exec_size, blockOffset)') + code('') + code('blockOffset = blockOffset + blocksPerGrid') + ENDDO() + code('') + else: #direct loop host stub call + if "UPDATEK" == name: + code('istat = cudaFuncSetCacheConfig(op_cuda_UPDATEK,cudaFuncCachePreferShared)') + code('CALL op_cuda_'+name+' <<>>( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Device'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if g_m in needDimList: + code('& scratchDevice'+str(g_m+1)+', &') + if accs[g_m] == OP_READ and dims[g_m].isdigit() and int(dims[g_m])==1: + code('& opDat'+str(g_m+1)+'Host_tmp, &') #XLF workaround + code('set%setPtr%size)') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. 
set%setPtr%core_size)') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('') + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_WRITE: + code('opDat'+str(g_m+1)+'Host(1:opArg'+str(g_m+1)+'%dim) = opGblDat'+str(g_m+1)+'Device'+name+'(1:opArg'+str(g_m+1)+'%dim)') + + if reduct: + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('reductionArrayHost'+str(g_m+1)+' = reductionArrayDevice'+str(g_m+1)+name+'') + code('') + DO('i10','0','reductionCardinality'+str(g_m+1)+'') + if accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = opDat'+str(g_m+1)+'Host + reductionArrayHost'+str(g_m+1)+'(i10+1)') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') + reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+'))') + elif accs[g_m] == OP_MIN: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MIN(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MIN(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + elif accs[g_m] == OP_MAX: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('opDat'+str(g_m+1)+'Host = MAX(opDat'+str(g_m+1)+'Host , reductionArrayHost'+str(g_m+1)+'(i10+1))') + else: + code('opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') = MAX(opDat'+str(g_m+1)+'Host(1:'+dims[g_m]+') , reductionArrayHost'+str(g_m+1)+'(i10 * ('+dims[g_m]+') + 1 : i10 * ('+dims[g_m]+') + ('+dims[g_m]+')))') + ENDDO() + code('') + if optflags[g_m] == 1: + 
ENDIF() + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') +# code('deallocate( reductionArrayDevice'+str(g_m+1)+' )') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + else: + print('Error, reduction type '+typs[g_m]+' unrecognised') + code('') + if optflags[g_m] == 1: + ENDIF() + + code('istat = cudaDeviceSynchronize()') + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') 
+ else: + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('') + if hybrid == 1: + code('') + comm('Stub for CPU execution') + code('') +########################################################################## +# Generate OpenMP host stub +########################################################################## +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + 
[mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +############################################################################ +### Generate OpenMP host stub +############################################################################ + code('SUBROUTINE '+name+'_host_cpu( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('END SUBROUTINE') + code('END MODULE') +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'_'+name + fid = open(name+'_gpukernel.CUF','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_gpukernel.CUF','w') + else: + fid = open(name+'_kernel.CUF','w') + date = datetime.datetime.now() + fid.write('!\n! 
def op2_gen_cuda_hydra():
    """Assemble the Hydra master CUDA Fortran file ``hydra_kernels.CUF``.

    Resets the module-level ``file_text`` buffer, emits the ``USE``
    statements and the constant include, splices in the previously
    accumulated ``header_text``, emits ``CONTAINS`` plus the device-function
    includes, splices in ``body_text``, closes the module and writes the
    result to ``hydra_kernels.CUF`` in the current working directory.

    ``code`` and ``comm`` are helpers defined elsewhere in this module;
    presumably they append one line to ``file_text`` -- confirm against
    their definitions.
    """
    # Only file_text is rebound here; header_text and body_text are read.
    global file_text

    file_text = ''
    code('USE OP2_FORTRAN_DECLARATIONS')
    code('USE OP2_FORTRAN_RT_SUPPORT')
    code('USE ISO_C_BINDING')
    code('USE CUDAFOR')
    code('USE CUDACONFIGURATIONPARAMS')
    code('')
    code('')
    comm('Constant declarations')
    code('#include "hydra_constants.inc"')
    code('')
    comm('Loop-specific global variables')
    file_text += header_text

    code('')
    code('CONTAINS')
    code('')
    code('#include "hydra_constants_set.inc"')
    code('#include "flux_low_gpufun.inc"')
    code('#include "bcs_kernels_gpufun.inc"')
    code('#include "update_kernels_gpufun.inc"')

    file_text += body_text
    # NOTE(review): END MODULE is emitted here but no opening MODULE
    # statement is; presumably header_text or a caller supplies it -- confirm.
    code('END MODULE')

    # Context manager so the handle is closed even if a write fails.
    with open('hydra_kernels.CUF', 'w') as fid:
        fid.write('!\n! auto-generated by op2.py\n!\n\n')
        fid.write(file_text)
auto-generated by op2.py\n!\n\n') + fid.write(file_text) + fid.close() diff --git a/translator/fortran/op2_gen_mpiseq.py b/translator/fortran/op2_gen_mpiseq.py new file mode 100644 index 000000000..4f2748f22 --- /dev/null +++ b/translator/fortran/op2_gen_mpiseq.py @@ -0,0 +1,462 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_kernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + file_text += '\n' + return + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += prefix+rep(text,g_m)+'\n' + +def code_pre(text): + 
global file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + + +def op2_gen_mpiseq(master, date, consts, kernels, hydra): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + 
name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + + code('') + +########################################################################## +# Inline user kernel function 
+########################################################################## + code('') + code('CONTAINS') + code('') + if hydra == 0: + comm('user function') + code('#include "'+name+'.inc"') + code('') + else: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + file_text += text + #code(kernels[nk]['mod_file']) + code('') + +########################################################################## +# Generate SEQ hust stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + for g_m 
in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + + code('') + code('INTEGER(kind=4) :: i1') + + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) :: ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m] == 0 and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * 
getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + code('') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + DO('i1','0','n_upper') + + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m] == 0 and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')') + +# if ninds > 0: +# IF('i1 .EQ. 
opSetCore%core_size') +# code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') +# ENDIF() + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+'):)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1:)') + ENDIF() + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + i1 * ('+dims[g_m]+') : i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + i1)' + if maps[g_m] == OP_MAP and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') : map'+str(mapinds[g_m]+1)+'idx * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_MAP and optflags[g_m]==1: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1:'+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1)' + if maps[g_m] == OP_GBL: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1:'+dims[g_m]+')' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + code('') + + 
IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + + + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime,0.00000_4,0.00000_4, 1)') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_seqkernel.F95','w') + else: + fid = open(name+'_seqkernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_mpiseq2.py b/translator/fortran/op2_gen_mpiseq2.py new file mode 100644 index 000000000..7a9d35a86 --- /dev/null +++ b/translator/fortran/op2_gen_mpiseq2.py @@ -0,0 +1,439 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_kernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + file_text += '\n' + return + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += 
prefix+rep(text,g_m)+'\n' + +def code_pre(text): + global file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + + +def op2_gen_mpiseq2(master, date, consts, kernels, hydra): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file 
+########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + + code('') + +########################################################################## +# Inline user kernel function +########################################################################## + 
code('') + code('CONTAINS') + code('') + if hydra == 0: + comm('user function') + code('#include "'+name+'.inc"') + code('') + else: + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + file_text += text + #code(kernels[nk]['mod_file']) + code('') + +########################################################################## +# Generate SEQ host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: 
opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + + code('') + code('INTEGER(kind=4) :: i1') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if 
maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + code('') + DO('i1','0','n_upper') + if ninds > 0: + IF('i1 .EQ. opSetCore%core_size') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+'):)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1:)') + ENDIF() + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + i1 * ('+dims[g_m]+') : i1 * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + i1)' + if maps[g_m] == OP_MAP and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+') : opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+'))' + 
elif maps[g_m] == OP_MAP and optflags[g_m]==1: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1:'+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1)' + if maps[g_m] == OP_GBL: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1:'+dims[g_m]+')' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + code('') + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + + + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime,0.00000,0.00000, 1)') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file 
+########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_seqkernel.F95','w') + else: + fid = open(name+'_seqkernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_mpiseq3.py b/translator/fortran/op2_gen_mpiseq3.py new file mode 100644 index 000000000..6e813b947 --- /dev/null +++ b/translator/fortran/op2_gen_mpiseq3.py @@ -0,0 +1,683 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import glob + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + file_text += '\n' + return + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += prefix+rep(text,g_m)+'\n' + +def code_pre(text): + global file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + 
+def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + + +def op2_gen_mpiseq3(master, date, consts, kernels, hydra, bookleaf): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + grouped = 0 + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif 
optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and (accs[i] == OP_INC or accs[i] == OP_RW): + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()):# and not (dims[g_m] in ['NPDE','DNTQMU','DNFCROW','1*1']): + needDimList = needDimList + [g_m] + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][9:]+'_module_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0 and bookleaf == 0: + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + + code('') + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + + if hydra == 1: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'][9:]+'_module' + filename = 'kernels/'+kernels[nk]['master_file']+'_'+name+'.inc' + if not os.path.isfile(filename): + files = [f for f in glob.glob('kernels/*'+name+'.inc')] + if len(files)>0: + filename = files[0] + else: + print('kernel for '+name+' not found') +# modfile = 
modfile.replace('INIT_INIT','INIT') +# name2 = name.replace('INIT_INIT','INIT') +# filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name2 + '.F95' +# if not os.path.isfile(filename): +# filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' +# if not os.path.isfile(filename): +# filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name2[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + #text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + + # + # substitute npdes with DNPDE + # +# using_npdes = 0 +# for g_m in range(0,nargs): +# if var[g_m] == 'npdes': +# using_npdes = 1 +# if using_npdes: +# i = re.search('\\bnpdes\\b',text) +# j = i.start() +# i = re.search('\\bnpdes\\b',text[j:]) +# j = j + i.start()+5 +# i = re.search('\\bnpdes\\b',text[j:]) +# j = j + i.start()+5 +# text = text[1:j] + re.sub('\\bnpdes\\b','NPDE',text[j:]) +# + file_text += text + file_text += '\n#undef MIN\n' + #code(kernels[nk]['mod_file']) + elif bookleaf == 1: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += text[i:j]+'\n\n' + else: + comm('user function') + code('#include "'+name+'.inc"') + code('') + + code('') + +########################################################################## +# Generate wrapper to iterate over set 
+########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top,argc,args,testfreq)') + code('implicit none') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Dim') + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local(opDat'+str(invinds[g_m]+1)+'Dim,*)') + else: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('INTEGER(kind=4) opDat'+str(g_m+1)+'Dim') + if maps[g_m] == OP_ID: + if g_m in needDimList: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local(opDat'+str(g_m+1)+'Dim,*)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + if g_m in needDimList: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local(opDat'+str(g_m+1)+'Dim)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1,argc,testfreq') + code('type ( op_arg ) , DIMENSION('+str(nargs)+') 
:: args') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') +# if ind_inc == 0 and reduct == 0: +# code('!DIR$ simd') + DO('i1','bottom','top') + IF('mod(i1,testfreq).eq.0') + if grouped: + code('call op_mpi_test_all_grouped(argc,args)') + else: + code('call op_mpi_test_all(argc,args)') + ENDIF() + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +########################################################################## +# Generate SEQ host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: 
opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + + code('') + code('INTEGER(kind=4) :: i1, testfreq') + code('REAL(kind=4) :: dataTransfer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('testfreq = op_mpi_get_test_frequency()') + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + if grouped: + code('n_upper = op_mpi_halo_exchanges_grouped(set%setCPtr,numberOfOpDats,opArgArray,1)') + else: + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + 
code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + code('') + if 1: + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& 0, opSetCore%core_size,numberOfOpDats,opArgArray,testfreq)') + if grouped: + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,1)') + else: + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + code('CALL op_wrap_'+name+'( &') + 
for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + #code('& 0, n_upper)') + code('& opSetCore%core_size, n_upper,numberOfOpDats,opArgArray,2147483647)') + + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + if grouped: + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,1)') + else: + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + + + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if optflags[g_m] == 1: + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + IF('opArg'+str(g_m+1)+'%opt == 
1') + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + else: + print('Error, reduction type '+typs[g_m]+' unrecognised') + code('') + else: + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + else: + print('Error, reduction type '+typs[g_m]+' unrecognised') + code('') + + code('call op_timers_core(endTime)') + code('') + code('dataTransfer = 0.0') + if ninds == 0: + for g_m in range(0,nargs): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * opSetCore%size') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * opSetCore%size * 2.d0') + if optflags[g_m] == 1: + ENDIF() + else: + names = [] + for g_m in range(0,ninds): + mult='' + if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: + mult = ' * 2.d0' + if not var[invinds[g_m]] in names: + if optflags[invinds[g_m]] == 1: + IF('opArg'+str(g_m+1)+'%opt == 
1') + code('dataTransfer = dataTransfer + opArg'+str(invinds[g_m]+1)+'%size * MIN(n_upper,getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+'))'+mult) + names = names + [var[invinds[g_m]]] + if optflags[invinds[g_m]] == 1: + ENDIF() + for g_m in range(0,nargs): + mult='' + if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: + mult = ' * 2.d0' + if not var[g_m] in names: + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + names = names + [var[invinds[g_m]]] + if maps[g_m] == OP_ID: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * MIN(n_upper,getSetSizeFromOpArg(opArg'+str(g_m+1)+'))'+mult) + elif maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size'+mult) + if optflags[g_m] == 1: + ENDIF() + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('dataTransfer = dataTransfer + n_upper * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim * 4.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + #code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + #code('& endTime-startTime,0.00000,0.00000, 1)') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'_'+name + fid = open(name+'_seqkernel.F90','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_seqkernel.f90','w') + else: + fid = open(name+'_seqkernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_mpivec.py b/translator/fortran/op2_gen_mpivec.py new file mode 100644 index 000000000..5b0f25578 --- /dev/null +++ b/translator/fortran/op2_gen_mpivec.py @@ -0,0 +1,1228 @@ +########################################################################## +# +# MPI+Vectorized seq code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_veckernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import glob +import util + +def comm(line): + global file_text, FORTRAN + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +=prefix+'! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + file_text += '\n' + return + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += 
prefix+rep(text,g_m)+'\n' + +def code_pre(text): + global file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def DO2(i,start,finish,step): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, '+step) + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def DO3(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+', 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + +def para_parse(text, j, op_b, cl_b): + """Parsing code block, i.e. 
text to find the correct closing brace""" + + depth = 0 + loc2 = j + + while 1: + if text[loc2] == op_b: + depth = depth + 1 + + elif text[loc2] == cl_b: + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +arg_parse=util.arg_parse + +def op2_gen_mpivec(master, date, consts, kernels, hydra, bookleaf): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + typestrigs = ['INTEGER','INT','REAL','DOUBLE','CHAR','FLOAT' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + + for g_m in range(0,nargs): + dims[g_m] = dims[g_m].replace('NPDE','6') + dims[g_m] = dims[g_m].replace('DNTQMU','3') + dims[g_m] = dims[g_m].replace('DNFCROW','3') + dims[g_m] = dims[g_m].replace('DMAXZONE','500') + + needDimList = [] + for g_m in range(0,nargs): + try: + dims[g_m] = str(eval(dims[g_m])) + except: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_MAP: + dims[g_m] = 'opDat'+str(inds[g_m])+'Dim' + else: + dims[g_m] = 'opDat'+str(g_m+1)+'Dim' + +# if (not dims[g_m].isdigit()) and not (dims[g_m] in ['NPDE','DNTQMU','DNFCROW']): +# needDimList = needDimList + [g_m] +# +# set three logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP : + j = i + indirect_kernel = j > -1 + + if bookleaf: + for i in range(0,nargs): + if 'LI' in dims[i]: + dims[i] = dims[i].replace('LI','100') + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## +# if hydra : +# if indirect_kernel: +# #if name <> 'VFLUX_EDGEF': #'ACCUMEDGES': +# #print "skipping indirect kernel :", name +# continue +# elif name <> 'GRAD_VOLAPF': #UPDATEK - problems with op_wirtes, SRCSA_NODE +# print "skipping unspecified kernel :", name +# continue + + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + 
if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + code('') + +#################################################################################### +# generate the user kernel function - creating versions for vectorisation as needed +#################################################################################### + code('') + code('CONTAINS') + code('') + if 1:#hydra == 0: +# +# First original version +# + funcall_in_kernel = 0 + comm('user function') + if bookleaf: + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 12 + len(name) + kernel_text = text[i:j] + elif hydra: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + p = re.compile('SUBROUTINE\\s+\\b'+name+'\\b',re.IGNORECASE) + i = p.search(text).start() + p = re.compile('END\\s+SUBROUTINE\\s+.*\\n',re.IGNORECASE) + j = p.search(text).end() + text= text[i:j] + #get rid of #ifdef COMPEX + i = text.find('#ifdef COMPLEX') + while i>=0: + j = i+text[i:].find('#else') + k = i+text[i:].find('#endif') + #if there is a nested macro, bail + l = i+text[i:].find('#if') + if l>i and l < k: + print('Nested macro in '+name+'prevents parsing for vectorization') + exit(-1) + #otherwise remove the body + if j>i and j -1: + current_type = bl[:colons+2] + else: + idx=0 + 
while not bl[idx].isspace() and not bl[idx]=='(': + idx = idx+1 + if bl[idx] == '(': + idx = arg_parse(bl,idx) + current_type = bl[:idx+1] + else: + current_type = '' + + if current_type != '': + last_bl_idx=bl_idx + idx = len(current_type) + prev_idx = len(current_type) + curr_dim = '' + while idx < len(bl): + if bl[idx].isspace(): + idx = idx+1 + if len(bl[prev_idx:idx].strip())==0: + prev_idx = idx + elif bl[idx] == '(': + idx2 = arg_parse(bl,idx) + dimlist.append(bl[idx+1:idx2]) + idx = idx2+1 + typelist.append(current_type) + varlist.append(bl[prev_idx:idx]) + prev_idx = idx + elif bl[idx] == ',': + if len(bl[prev_idx:idx].strip()): + typelist.append(current_type) + varlist.append(bl[prev_idx:idx].strip()) + dimlist.append(curr_dim) + idx = idx + 1 + prev_idx = idx + elif idx == len(bl)-1: + idx = idx + 1 + typelist.append(current_type) + varlist.append(bl[prev_idx:idx].strip()) + dimlist.append(curr_dim) + else: + idx = idx+1 + bl_idx = bl_idx + 1 + + depth = depth - 2 + for i in range(0,len(varlist)): + for j in range(0,len(para)): + if (not para[j] == 'DIRECT') and (not re.search(r'\b'+para[j]+r'\b', varlist[i]) == None): + if dimlist[i] != '': + dimsstr=dimlist[i] + elif j in needDimList: + dimsstr = '*' + else: + dimsstr = dims[j] + if maps[j] == OP_MAP: + if (accs[j] == OP_INC or accs[j] == OP_RW or accs[j] == OP_WRITE): + typelist[i] = typs[j]+', DIMENSION(SIMD_VEC,'+dimsstr+') ::' + varlist[i] = para[j] + if (accs[j] == OP_READ): + typelist[i] = typs[j]+', DIMENSION(SIMD_VEC,'+dimsstr+'), INTENT(IN) ::' + varlist[i] = para[j] + elif maps[j] == OP_GBL: + if ninds>0: + typelist[i] = typs[j]+', DIMENSION(SIMD_VEC,'+dimsstr+') ::' + else: + typelist[i] = typs[j]+', DIMENSION('+dimsstr+') ::' + varlist[i] = para[j] + + types_inserted = 0 + bl_idx = 0 + while bl_idx < len(body_lines): + bl = body_lines[bl_idx].strip() + if any((re.search(r'\b'+typestr+r'\b',bl.lower()) != None and \ + re.search(r'\b'+typestr+r'\b',bl.lower()).start()==0) for typestr in 
typestr_list): + while bl.find('&') != -1: #TODO: commented out &? + bl_idx = bl_idx + 1 + if body_lines[bl_idx].strip().find('#') == 0: + bl_idx = bl_idx + 1 + bl = bl[:bl.find('&')] + body_lines[bl_idx].strip()[1:] + if types_inserted == 0: + for i in range(0,len(varlist)): + code(' '+typelist[i]+' '+varlist[i]) + types_inserted = 1 + bl_idx = bl_idx + 1 + continue + temp = body_lines[bl_idx] + for p in range(0,len(para)): + temp = re.sub(r'(\b'+para[p]+r'\b\s*'+'\('+')', r'\1'+'idx,', temp) + #TODO: only if dim is 1, otherwise vector op - fail? + temp = re.sub(r'\b'+para[p]+r'\b(\s*\n|[^\(])', para[p]+'(idx,1)'+r'\1', temp+'\n')[:-1] + kernel_line = temp + if bl_idx == len(body_lines)-1: + kernel_line = kernel_line.lower().replace(name.lower(),name+'_vec') + if hydra and types_inserted==1: + kernel_line = re.sub('\\bnpdes\\b','NPDE',kernel_line) + code(kernel_line) + bl_idx = bl_idx + 1 + + + code('#endif') + + ### Hydra specific user kernel #### should not be in code generator .. to be fixed + else: + + # + # First original version + # + + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'][4:] + #print modfile + modfile = modfile.replace('INIT_INIT','INIT') + name2 = name.replace('INIT_INIT','INIT') + #print modfile + file_name = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name2 + '.F95' + if not os.path.isfile(file_name): + file_name = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(file_name): + file_name = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name2[:-1] + '.F95' + fid = open(file_name, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + file_text += text + #code('#define SIMD_VEC 8') + 
code('#define SIMD_VEC 4') + + # + # Modified vectorisable version if its an indirect kernel + # - direct kernels can be vectorised without modification + # + + if indirect_kernel: + code('#ifdef VECTORIZE') + comm('user function -- modified for vectorisation') + kernel_text = text + p = re.compile('SUBROUTINE\\s+\\b'+name+'\\b',re.IGNORECASE) + i = p.search(kernel_text).start() + + if(i < 0): + print("\n********") + print("Error: cannot locate user kernel function name: "+name+" - Aborting code generation") + exit(2) + i2 = i + j = kernel_text[i:].find('(') + k = para_parse(kernel_text, i+j, '(', ')') + l = kernel_text[k:].find('END'+'\\s+\\b'+'SUBROUTINE') + para = kernel_text[j+1:k].split(',') + #remove direct vars from para + para_ind = [] + for i in range(0,nargs): + if maps[i] == OP_ID: + para[i] = 'DIRECT' + + code('SUBROUTINE '+name+'_vec('+kernel_text[j+1:k]+',idx)') + depth = depth + 2 + code('!dir$ attributes vector :: '+name+'_vec') + code('IMPLICIT NONE') + print(needDimList) + for g_m in range(0,nargs): + if g_m in needDimList: + dimsstr = '*' + else: + dimsstr = '(DIMS)' + if maps[g_m] == OP_MAP: + if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): + code('TYP, DIMENSION(SIMD_VEC,'+dimsstr+') :: '+para[g_m]) + if (accs[g_m] == OP_READ): + code('TYP, DIMENSION(SIMD_VEC,'+dimsstr+'), INTENT(IN) :: '+para[g_m]) + elif maps[g_m] == OP_GBL: + code('TYP DIMENSION('+dimsstr+') :: '+para[g_m]) + code('INTEGER(4) :: idx') + + + code('') + +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: 
+ if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + code('implicit none') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Dim') + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local(opDat'+str(invinds[g_m]+1)+'Dim,*)') + else: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('INTEGER(kind=4) opDat'+str(g_m+1)+'Dim') + if maps[g_m] == OP_ID: + if g_m in needDimList: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local(opDat'+str(g_m+1)+'Dim,*)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + if g_m in needDimList: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local(opDat'+str(g_m+1)+'Dim)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1, i2') + if len(needDimList)>0: + code('INTEGER(KIND=4) i3') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + + #vars for globals - used when called with vectorisation + for g_m in range(0,nargs): + if 
maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_WRITE\ + or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + code('TYP dat'+str(g_m+1)+'(SIMD_VEC*'+dims[g_m]+')') + code('!dir$ attributes align: 64:: dat'+str(g_m+1)) + code('') + +# +# kernel call for indirect version +# + #If indirect kernel then add vector gather/scatter variables + if indirect_kernel: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \ + or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \ + or accs[g_m] == OP_INC): + code('TYP dat'+str(g_m+1)+'(SIMD_VEC,DIMS)') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ \ + or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE \ + or accs[g_m] == OP_INC): + code('!dir$ attributes align: 64:: dat'+str(g_m+1)) + code('') + + for g_m in range(0,ninds): + code('!DIR$ ASSUME_ALIGNED opDat'+str(invinds[g_m]+1)+'Local : 64') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('!DIR$ ASSUME_ALIGNED opDat'+str(g_m+1)+'Local : 64') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('!DIR$ ASSUME_ALIGNED opDat'+str(invinds[inds[g_m]-1]+1)+'Map : 64') + + + code_pre('#ifdef VECTORIZE') + DO2('i1','bottom','((top-1)/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + code('!DIR$ SIMD') + DO3('i2','1','SIMD_VEC') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + (i1+i2-1) * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') + 1') + + code('') + + #setup gathers + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW):#and (not mapinds[g_m] in k): + if optflags[g_m]==1: + 
IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if g_m in needDimList: + DO3('i3','1',dims[g_m]) + code('dat'+str(g_m+1)+'(i2,i3) = opDat'+\ + str(invinds[inds[g_m]-1]+1)+'Local(i3,map'+str(mapinds[g_m]+1)+'idx)') + ENDDO() + else: + for d in range(0,int(eval(dims[g_m]))): + code('dat'+str(g_m+1)+'(i2,'+str(d+1)+') = opDat'+\ + str(invinds[inds[g_m]-1]+1)+'Local('+str(d+1)+',map'+str(mapinds[g_m]+1)+'idx)') + if optflags[g_m]==1: + ENDIF() + code('') + elif (accs[g_m] == OP_INC): + code('dat'+str(g_m+1)+'(i2,:) = 0.0') + ENDDO() + + #initialize globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('dat'+str(g_m+1)+' = 0.0_8') + elif accs[g_m] == OP_MAX: + code('dat'+str(g_m+1)+' = -HUGE(dat'+str(g_m+1)+')') + elif accs[g_m] == OP_MIN: + code('dat'+str(g_m+1)+' = HUGE(dat'+str(g_m+1)+')') + + #vectorized kernel call + code('!DIR$ SIMD') + code('!DIR$ FORCEINLINE') + DO3('i2','1','SIMD_VEC') + comm('vectorized kernel call') + line = 'CALL '+name+'_vec( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,(i1+i2-1)+1), &' + if maps[g_m] == OP_MAP: + line = line +indent + '& dat'+str(g_m+1)+', &' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1), &' + elif dims[g_m].isdigit() and eval(dims[g_m]) == 1: + line = line + indent +'& dat'+str(g_m+1)+'(i2), &' + else: + line = line + indent +'& dat'+str(g_m+1)+'((DIMS)*(i2-1)+1:(DIMS)*(i2-1)+(DIMS)), &' + line = line + indent +'& i2)' + code(line) + ENDDO() + + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + leftarg = 'opDat'+str(g_m+1)+'Local(1:(DIMS))' + rightarg = 'dat'+str(g_m+1)+'((DIMS)*(i2-1)+1:(DIMS)*(i2-1)+(DIMS))' + if accs[g_m] == OP_INC: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = '+leftarg+' + '+rightarg) + ENDDO() + elif accs[g_m] == OP_MAX: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = MAX('+leftarg+' , 
'+rightarg+')') + ENDDO() + elif accs[g_m] == OP_MIN: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = MIN('+leftarg+' , '+rightarg+')') + ENDDO() + #do the scatters + DO3('i2','1','SIMD_VEC') + if nmaps > 0: + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if (accs[g_m] == OP_INC or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE): + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + (i1+i2-1) * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') + 1') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP : + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (accs[g_m] == OP_INC ): + if g_m in needDimList: + DO3('i3','1',dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local(i3,map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local(i3,map'+str(mapinds[g_m]+1)+'idx) + dat'+str(g_m+1)+'(i2,i3)') + ENDDO() + else: + for d in range(0,int(eval(dims[g_m]))): + code('opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local('+str(d+1)+',map'+str(mapinds[g_m]+1)+'idx) = opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local('+str(d+1)+',map'+str(mapinds[g_m]+1)+'idx) + dat'+str(g_m+1)+'(i2,'+str(d+1)+')') + code('') + if (accs[g_m] == OP_WRITE or accs[g_m] == OP_RW): + if g_m in needDimList: + DO3('i3','1',dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local(i3,map'+str(mapinds[g_m]+1)+'idx) = dat'+str(g_m+1)+'(i2,i3)') + ENDDO() + else: + for d in range(0,int(eval(dims[g_m]))): + code('opDat'+str(invinds[inds[g_m]-1]+1)+\ + 'Local('+str(d+1)+',map'+str(mapinds[g_m]+1)+'idx) = dat'+str(g_m+1)+'(i2,'+str(d+1)+')') + code('') + if optflags[g_m]==1: + ENDIF() + + + + ENDDO() + + + #do reductions + #TODO -- need exmple code + +# +# kernel call for direct version +# + else: + code_pre('#ifdef VECTORIZE') + DO2('i1','bottom','((top-1)/SIMD_VEC)*SIMD_VEC','SIMD_VEC') + #initialize globals + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + if accs[g_m] == 
OP_INC: + code('dat'+str(g_m+1)+' = 0.0_8') + elif accs[g_m] == OP_MAX: + code('dat'+str(g_m+1)+' = -HUGE(dat'+str(g_m+1)+')') + elif accs[g_m] == OP_MIN: + code('dat'+str(g_m+1)+' = HUGE(dat'+str(g_m+1)+')') + #vectorized kernel call + code('!DIR$ SIMD') + code('!DIR$ FORCEINLINE') + DO3('i2','1','SIMD_VEC') + comm('vectorized kernel call') + line = ' CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,(i1+i2-1)+1)' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_READ: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + elif dims[g_m].isdigit() and eval(dims[g_m]) == 1: + line = line + indent +'& dat'+str(g_m+1)+'(i2)' + elif accs[g_m] != OP_READ: + line = line + indent +'& dat'+str(g_m+1)+'('+dims[g_m]+\ + '*(i2-1)+1:'+dims[g_m]+'*(i2-1)+'+dims[g_m]+')' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + code('') + + #do reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + leftarg = 'opDat'+str(g_m+1)+'Local(1:(DIMS))' + rightarg = 'dat'+str(g_m+1)+'((DIMS)*(i2-1)+1:(DIMS)*(i2-1)+(DIMS))' + if accs[g_m] == OP_INC: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = '+leftarg+' + '+rightarg) + ENDDO() + elif accs[g_m] == OP_MAX: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = MAX('+leftarg+' , '+rightarg+')') + ENDDO() + elif accs[g_m] == OP_MIN: + DO3('i2','1','SIMD_VEC') + code(leftarg+' = MIN('+leftarg+' , '+rightarg+')') + ENDDO() + + + ENDDO()# end of SIMD_VEC length strided loop +# +# remainder of loop +# + + comm('remainder') + DO('i1','((top-1)/SIMD_VEC)*SIMD_VEC','top') + depth = depth - 2 + code_pre('#else') + code('!DIR$ FORCEINLINE') + DO('i1','bottom','top') + depth = depth - 2 + code_pre('#endif') + depth = depth + 2 + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + 
code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = ' CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +########################################################################## +# Generate SEQ host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('REAL(kind=4) :: dataTransfer') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: 
opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + + code('') + code('INTEGER(kind=4) :: i1') + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * 
getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + code('') + if 0: + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& 0, opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& 
opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& 0, n_upper)') +# code('& opSetCore%core_size, n_upper)') + + +# IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') +# code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') +# ENDIF() +# code('') + + + + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + code('dataTransfer = 0.0') + if ninds == 0: + for g_m in range(0,nargs): + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * opSetCore%size') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * opSetCore%size * 2.d0') + if optflags[g_m] == 1: + ENDIF() + else: + names = [] + for g_m in range(0,ninds): + mult='' + if indaccs[g_m] != OP_WRITE and indaccs[g_m] != OP_READ: + mult = ' * 2.d0' + if not var[invinds[g_m]] in names: + 
if optflags[invinds[g_m]] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('dataTransfer = dataTransfer + opArg'+str(invinds[g_m]+1)+'%size * MIN(n_upper,getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+'))'+mult) + names = names + [var[invinds[g_m]]] + if optflags[invinds[g_m]] == 1: + ENDIF() + for g_m in range(0,nargs): + mult='' + if accs[g_m] != OP_WRITE and accs[g_m] != OP_READ: + mult = ' * 2.d0' + if not var[g_m] in names: + if optflags[g_m] == 1: + IF('opArg'+str(g_m+1)+'%opt == 1') + names = names + [var[invinds[g_m]]] + if maps[g_m] == OP_ID: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * MIN(n_upper,getSetSizeFromOpArg(opArg'+str(g_m+1)+'))'+mult) + elif maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size'+mult) + if optflags[g_m] == 1: + ENDIF() + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('dataTransfer = dataTransfer + n_upper * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim * 4.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + code('') + #code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + #code('& endTime-startTime,0.00000,0.00000, 1)') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + #fid = open(name+'_seqkernel.F95','w') + fid = open(name+'_veckernel.F95','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_veckernel.F90','w') + else: + #fid = open(name+'_seqkernel.F90','w') + fid = open(name+'_veckernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
# ---------------------------------------------------------------------------
# Text-emitter helpers.
#
# All helpers operate on module-level state set up by the generator:
#   file_text    -- the generated source accumulated so far
#   depth        -- current indentation, in spaces
#   FORTRAN/CPP  -- flags selecting which syntax is emitted
#   g_m          -- index of the argument whose values fill the placeholders
#   dims, idxs, typs, inddims, indtyps -- per-argument lookup tables
# ---------------------------------------------------------------------------

def _substitute(line, token, make_value):
    """Return `line` with every literal occurrence of `token` replaced.

    `make_value` is a zero-argument callable producing the replacement text.
    It is only invoked when `token` actually occurs in `line`, so a line
    without placeholders never indexes the per-argument tables.  The
    replacement is inserted verbatim (no regex escape processing).
    """
    if token in line:
        return line.replace(token, make_value())
    return line


def comm(line):
    """Append a comment to `file_text`.

    An empty `line` emits a bare newline.  In FORTRAN mode the comment is
    written as '! <line>' without indentation; in CPP mode it is written as
    '//<line>' indented by `depth` spaces.  With neither flag set, nothing is
    emitted for a non-empty line.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
    elif FORTRAN:
        file_text += '! '+line+'\n'
    elif CPP:
        file_text += ' '*depth+'//'+line+'\n'


def rep(line, m):
    """Expand the placeholder tokens in `line` for argument index `m`.

    Tokens: INDDIM, INDTYP, INDARG, DIMS (FORTRAN) / DIM (CPP), ARG, TYP, IDX.
    FORTRAN numbers arguments from 1 (`arg<m+1>`), CPP from 0 (`arg<m>`).
    In FORTRAN mode INDDIM/INDTYP are only expanded when `m` is a valid index
    into `inddims`.  With neither flag set the line is returned unchanged.

    The substitution order matters: the longer IND* tokens must be handled
    before ARG/TYP/DIM, which are substrings of them.
    """
    if FORTRAN:
        if m < len(inddims):
            line = _substitute(line, 'INDDIM', lambda: str(inddims[m]))
            line = _substitute(line, 'INDTYP', lambda: str(indtyps[m]))

        line = _substitute(line, 'INDARG', lambda: 'ind_arg'+str(m+1))
        line = _substitute(line, 'DIMS', lambda: str(dims[m]))
        line = _substitute(line, 'ARG', lambda: 'arg'+str(m+1))
        line = _substitute(line, 'TYP', lambda: typs[m])
        line = _substitute(line, 'IDX', lambda: str(int(idxs[m])))
    elif CPP:
        line = _substitute(line, 'INDDIM', lambda: str(inddims[m]))
        line = _substitute(line, 'INDTYP', lambda: str(indtyps[m]))

        line = _substitute(line, 'INDARG', lambda: 'ind_arg'+str(m))
        line = _substitute(line, 'DIM', lambda: str(dims[m]))
        line = _substitute(line, 'ARG', lambda: 'arg'+str(m))
        line = _substitute(line, 'TYP', lambda: typs[m])
        line = _substitute(line, 'IDX', lambda: str(int(idxs[m])))
    return line


def code(text):
    """Append one line of generated code to `file_text`.

    Empty `text` emits a bare newline.  Otherwise the text is passed through
    `rep` for the current argument `g_m`, indented by `depth` spaces and
    terminated with a newline.  With neither FORTRAN nor CPP set, non-empty
    text emits nothing.
    """
    global file_text
    if len(text) == 0:
        file_text += '\n'
        return
    if FORTRAN or CPP:
        file_text += ' '*depth+rep(text,g_m)+'\n'


def code_pre(text):
    """Append a line with no indentation (used for preprocessor directives).

    The text still goes through `rep` for the current argument `g_m`.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text,g_m)+'\n'


def DO(i,start,finish):
    """Open a counted loop over [start, finish) and indent by two spaces.

    FORTRAN output carries an explicit stride: 'DO i = start, finish-1, 1'.
    """
    global depth
    if FORTRAN:
        code('DO '+i+' = '+start+', '+finish+'-1, 1')
    elif CPP:
        code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
    depth += 2


def FOR(i,start,finish):
    """Open a counted loop over [start, finish) and indent by two spaces.

    Same as `DO` except that the FORTRAN output omits the explicit stride.
    """
    global depth
    if FORTRAN:
        code('DO '+i+' = '+start+', '+finish+'-1')
    elif CPP:
        code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
    depth += 2


def ENDDO():
    """Dedent by two spaces and close the loop opened by `DO`."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def ENDFOR():
    """Dedent by two spaces and close the loop opened by `FOR`."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def IF(line):
    """Open a conditional on the expression `line` and indent by two spaces."""
    global depth
    if FORTRAN:
        code('IF ('+line+') THEN')
    elif CPP:
        code('if ('+ line + ') {')
    depth += 2


def ELSE():
    """Emit the else-branch header at the indentation of the matching `IF`."""
    global depth
    depth -= 2
    if FORTRAN:
        code('ELSE')
    elif CPP:
        code('else {')
    depth += 2


def ENDIF():
    """Dedent by two spaces and close the conditional opened by `IF`."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END IF')
    elif CPP:
        code('}')
(0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + stage_soa = nopts + stage_flags=[0]*nargs; + host_exec = 0 + + for g_m in range(0,nargs): + if 'NPDE' in dims[g_m]: + dims[g_m] = dims[g_m].replace('NPDE','6') + try: + newdim = str(eval(dims[g_m])) + dims[g_m] = newdim + except NameError as inst: + dims[g_m] + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not dims[g_m].isdigit(): + print('WARNING: unknown dimension reduction argument '+str(g_m)+' in '+name+': host sequential execution') + host_exec = 1 + for i in range(0,nargs): + soaflags[i] = 0 +# for g_m in range(0,nargs): +# if dims[g_m] == 'NPDE': +# dims[g_m] = '6' + + if 'UPDATE_EXPK' in name: + host_exec=1 + + if host_exec: + for i in range(0,nargs): + soaflags[i] = 0 + + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + unknown_reduction_size = 0 + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()): + found=0 + for string in ['NPDE','DNTQMU','DNFCROW','1*1']: + if string in dims[g_m]: + found=1 + if found==0: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + unknown_reduction_size = 1 + soaflags[g_m] = 1 + is_soa = 1 + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE 
'+kernels[nk]['mod_file'][4:]+'_MODULE') + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + + code('') + if bookleaf==0: + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + if bookleaf==0: + code('#endif') + + + code('') + + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + #strides for SoA + if any_soa and not host_exec: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT') + code('!$acc declare create(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT)') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('INTEGER(kind=4) :: direct_stride_OP2CONSTANT') + code('!$acc declare create(direct_stride_OP2CONSTANT)') + dir_soa = g_m + break + + code('') + + if is_soa > -1: + code('#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)') + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + 
code('CONTAINS') + code('') + if hydra: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','subroutine') + code('') + #remove all comments + util.const_list = [] + text = re.sub('!.*\n','\n',text) +# if not host_exec: +# text = replace_consts(text) + + text = text.replace('subroutine '+name, 'subroutine '+name+'_gpu') + + if not host_exec: + text = text.replace(')\n',')\n!$acc routine seq\n',1) + using_npdes = 0 + for g_m in range(0,nargs): + if var[g_m] == 'npdes': + using_npdes = 1 + if using_npdes==1: + text = replace_npdes(text) + + if not host_exec: + #find subroutine calls + util.funlist = [name.lower()] + plus_kernels = find_function_calls(text,'') + + if plus_kernels == '': + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + + text = text + '\n' + plus_kernels + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) + code('!$acc routine('+fun+'_gpu)') + + if plus_kernels != '': + print(name) + for i in range(0,nargs): + if soaflags[i]==1 and not (maps[i] ==OP_GBL): + stage_flags[i] = 1; + stage_soa = 1 + + #strip "use" statements + i = re.search('\\buse\\b',text.lower()) + i_offset = 0 + while not (i is None): + i_offset = i_offset+i.start() + if not ('HYDRA_CONST_MODULE' in text[i_offset:i_offset+23]): + text = text[0:i_offset]+'!'+text[i_offset:] + i_offset = i_offset+4 + i = re.search('\\buse\\b',text[i_offset:].lower()) + + + file_text += text + #code(kernels[nk]['mod_file']) + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not 
os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + text = text[i:j]+'\n\n' + text = re.sub(r'subroutine\s*'+name, r'subroutine '+name+'_gpu',text,2,re.IGNORECASE) + if not host_exec: + text = text.replace(')\n',')\n!$acc routine seq\n',1) + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,0,hydra,bookleaf) + file_text += text + else: + comm('user function') + fid = open(name+'.inc', 'r') + text = fid.read() + text = re.sub(r'subroutine\s*'+name, r'subroutine '+name+'_gpu',text,1,re.IGNORECASE) + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + text = text.replace(')\n',')\n!$acc routine seq\n',1) + code(text) + + + code('') + +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + if ninds > 0: + code('& col_reord, set_size, &') + code('& bottom,top)') + + code('implicit none') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + if 
invinds[g_m] in needDimList: + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Dim') + if soaflags[invinds[g_m]]: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local(*)') + else: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('INTEGER(kind=4) opDat'+str(g_m+1)+'Dim') + if maps[g_m] == OP_ID: + if soaflags[g_m]: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local(*)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + if accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE and dims[g_m].isdigit() and int(dims[g_m])==1: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + if ninds>0: + code('INTEGER(kind=4) col_reord(*)') + code('INTEGER(kind=4) set_size') + + if not host_exec: + #when functions call functions, we can no longer reliably do SoA, therefore we need to stage everything in registers + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if g_m in needDimList: + print('Error, cannot statically determine dim of argument '+str(g_m+1)+' in kernel '+name) + sys.exit(-1) + code(typs[g_m]+', DIMENSION('+dims[g_m]+') :: opDat'+str(g_m+1)+'Staged') + + code('') + + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code(typs[g_m]+' opDat'+str(g_m+1)+'Local_'+str(d+1)) + code(typs[g_m]+' opDat'+str(g_m+1)+'LocalArr('+dims[g_m]+')') + + + code('INTEGER(kind=4) bottom,top,i1,i2') + if 
nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + if stage_soa>0: + code('INTEGER(kind=4) i3') + + code('') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = opDat'+str(g_m+1)+'Local('+str(d+1)+')') + + code('') + + line = '!$acc parallel loop independent gang vector &\n' + for g_m in range(0,ninds): + line = line + '!$acc& deviceptr(opDat'+str(invinds[g_m]+1)+'Local) &\n' + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + '!$acc& deviceptr(opDat'+str(g_m+1)+'Local) &\n' + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + line = line + '!$acc& deviceptr(opDat'+str(invinds[inds[g_m]-1]+1)+'Map) &\n' + if ninds > 0: + line = line + '!$acc& deviceptr(col_reord) private(i1) &\n' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line = line + '!$acc& private(map'+str(mapinds[g_m]+1)+'idx) &\n' + + + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + if int(dims[g_m])==1: + if accs[g_m] == OP_INC: + line = line + '!$acc& reduction(+:'+'opDat'+str(g_m+1)+'Local) &\n' + if accs[g_m] == OP_MIN: + line = line + '!$acc& reduction(min:'+'opDat'+str(g_m+1)+'Local) &\n' + if accs[g_m] == OP_MAX: + line = line + '!$acc& reduction(max:'+'opDat'+str(g_m+1)+'Local) &\n' + else: + for d in range(0,int(dims[g_m])): + if accs[g_m] == OP_INC: + line = line + '!$acc& reduction(+:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if accs[g_m] == OP_MIN: + line = line + '!$acc& 
reduction(min:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if accs[g_m] == OP_MAX: + line = line + '!$acc& reduction(max:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if stage_soa>0: + line = line + '!$acc& private(i3) &\n' + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + '!$acc& private(opDat'+str(g_m+1)+'Staged) &\n' + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + line = line + '!$acc& private(opDat'+str(g_m+1)+'LocalArr) &\n' + line = line[:-2] + if not host_exec: + code(line) + + if ninds > 0 and not host_exec: + DO('i2','bottom','top') + code('i1 = col_reord(i2+1)') + else: + DO('i1','bottom','top') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'LocalArr = 0') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'LocalArr = HUGE(opDat'+str(g_m+1)+'Local_1)') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'LocalArr = -HUGE(opDat'+str(g_m+1)+'Local_1)') + + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): + k = k + [(optflags[g_m]*nargs+mapinds[g_m])] + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if host_exec: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + else: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + set_size * '+str(int(idxs[g_m])-1)+')+1') + if optflags[g_m]==1: + ENDIF() + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i3','0', 
dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i3+1) = opDat'+str(invinds[inds[g_m]-1]+1)+'Local &') + code(' & (i3 * '+get_stride_string(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx)') + ENDDO() + else: + DO('i3','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i3+1) = opDat'+str(g_m+1)+'Local &') + code(' & (1 + i3 * direct_stride_OP2CONSTANT + i1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + + comm('kernel call') + line = 'CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif maps[g_m] == OP_ID: + if soaflags[g_m]: + line = line + indent + '& opDat'+str(g_m+1)+'Local(i1+1)' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + elif maps[g_m] == OP_MAP: + if soaflags[g_m]: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_GBL: + if accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line + indent +'& opDat'+str(g_m+1)+'Local' + else: + if host_exec: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + else: + line = line + indent +'& opDat'+str(g_m+1)+'LocalArr' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + code(line + indent + '& )') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = opDat'+str(g_m+1)+'Local_'+str(d+1)+' + opDat'+str(g_m+1)+'LocalArr('+str(d+1)+')') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = 
MIN(opDat'+str(g_m+1)+'Local_'+str(d+1)+', opDat'+str(g_m+1)+'LocalArr('+str(d+1)+'))') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = MAX(opDat'+str(g_m+1)+'Local_'+str(d+1)+', opDat'+str(g_m+1)+'LocalArr('+str(d+1)+'))') + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i3','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Local(i3 * '+get_stride_string(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = &') + code(' & opDat'+str(g_m+1)+'Staged(i3+1)') + ENDDO() + else: + DO('i3','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Local(1 + i3 * direct_stride_OP2CONSTANT + i1) = &') + code(' & opDat'+str(g_m+1)+'Staged(i3+1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + depth = depth + 2 + + depth = depth - 2 + ENDDO() + if not host_exec: + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local('+str(d+1)+') = opDat'+str(g_m+1)+'Local_'+str(d+1)) + depth = depth - 2 + code('END SUBROUTINE') + +########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + 
code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('INTEGER(kind=4), SAVE :: calledTimes=0') + + if ninds > 0: #if indirect loop + code('INTEGER(kind=4) :: exec_size') + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: col_reord_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + 
else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + code(typs[g_m]+', ALLOCATABLE, DIMENSION(:) :: opDat'+str(g_m+1)+'LocalReduction') + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + #managing constants + if any_soa and not host_exec: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + IF('(calledTimes.EQ.0).OR.(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT.NE.getSetSizeFromOpArg(opArg'+str(g_m+1)+'))') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT = getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + code('!$acc update device(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT)') + ENDIF() + if dir_soa!=-1: + IF('(calledTimes.EQ.0).OR.(direct_stride_OP2CONSTANT.NE.getSetSizeFromOpArg(opArg'+str(dir_soa+1)+'))') + code('direct_stride_OP2CONSTANT = getSetSizeFromOpArg(opArg'+str(dir_soa+1)+')') + code('!$acc update device(direct_stride_OP2CONSTANT)') + ENDIF() + + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + if host_exec: + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + else: + 
code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + code('opSetCore => set%setPtr') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('exec_size = opSetCore%size + opSetCore%exec_size') + code('numberOfIndirectOpDats = '+str(ninds)) + code('partitionSize = 128 !no effect here, just have to set') + code('') + code('partitionSize=0') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,4)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%col_reord,col_reord_'+name+',(/exec_size/))') + code('CALL c_f_pointer(actualPlan_'+name+'%color2_offsets,offset_'+name+',(/actualPlan_'+name+'%ncolors+1/))') + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + suffix = '' + if not host_exec: + suffix='_d' + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data'+suffix+',opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data'+suffix+',opDat'+str(invinds[g_m]+1)+'Map,(/exec_size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL 
c_f_pointer(opArg'+str(g_m+1)+'%data'+suffix+',opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('allocate(opDat'+str(g_m+1)+'LocalReduction(opArg'+str(g_m+1)+'%dim)) ') + code('opDat'+str(g_m+1)+'LocalReduction = opDat'+str(g_m+1)+'Local') + code('') + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + code('') + + if ninds > 0: #indirect loop host stub call + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 1') #actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('offset_b = offset_'+name+'(i1 + 1)') + code('nelem = offset_'+name+'(i1 + 1 + 1)') + + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('& opDat'+str(g_m+1)+'LocalReduction(1), &') + else: + code('& opDat'+str(g_m+1)+'Local(1), &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& col_reord_'+name+', exec_size, offset_b, nelem )') + + if reduct and not host_exec: + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_owned -1') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'LocalReduction') + ENDIF() + + ENDDO() + else: + code('sliceStart = 0') + code('sliceEnd = opSetCore%size') + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('& opDat'+str(g_m+1)+'LocalReduction(1), &') + else: + code('& opDat'+str(g_m+1)+'Local(1), &') + code('& sliceStart, sliceEnd)') + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + if host_exec: + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + else: + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + + if ninds==0: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'LocalReduction') + code('') + if host_exec: + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + else: + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('deallocate( opDat'+str(g_m+1)+'LocalReduction )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL 
op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_acckernel.F95','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_acckernel.f90','w') + else: + fid = open(name+'_acckernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
##########################################################################
#
# OpenMP code generator
#
# This routine is called by op2_fortran which parses the input files
#
# It produces a file xxx_kernel.F90 for each kernel,
# plus a master kernel file
#
##########################################################################

import re
import datetime
import os


# The helpers below append generated source text to the module-level
# ``file_text`` buffer.  They read the module-level globals ``FORTRAN`` /
# ``CPP`` (output language switches), ``depth`` (current indentation, in
# spaces) and ``g_m`` (index of the argument placeholders refer to); these
# globals are assigned by the generator routine before the helpers are used.


def comm(line):
    """Append a comment line to ``file_text``.

    An empty ``line`` produces a bare newline.  In Fortran mode the comment
    is written as ``! <line>`` starting in column 0 (no indentation); in C++
    mode it is written as ``//<line>`` indented by ``depth`` spaces.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
    elif FORTRAN:
        file_text += '! ' + line + '\n'
    elif CPP:
        file_text += ' ' * depth + '//' + line + '\n'


def rep(line, m):
    """Return ``line`` with the argument placeholders expanded for argument ``m``.

    Placeholders (replaced in this order, which matters because some tokens
    are substrings of others):

    * ``INDDIM`` / ``INDTYP`` -> ``inddims[m]`` / ``indtyps[m]``; skipped when
      ``m`` is not a valid index into ``inddims``
    * ``INDARG`` -> ``ind_arg<k>``
    * ``DIMS`` (Fortran) or ``DIM`` (C++) -> ``dims[m]``
    * ``ARG`` -> ``arg<k>``
    * ``TYP`` -> ``typs[m]``
    * ``IDX`` -> ``int(idxs[m])``

    ``<k>`` is ``m + 1`` in Fortran mode and ``m`` in C++ mode.  When neither
    mode is active the line is returned unchanged.
    """
    if FORTRAN:
        arg_no = str(m + 1)
        dim_token = 'DIMS'
    elif CPP:
        arg_no = str(m)
        dim_token = 'DIM'
    else:
        return line

    # Plain substring replacement: the tokens are fixed strings, and the
    # substituted values must be inserted literally (a regex replacement
    # template would interpret backslashes in them).
    if m < len(inddims):
        line = line.replace('INDDIM', str(inddims[m]))
        line = line.replace('INDTYP', str(indtyps[m]))

    line = line.replace('INDARG', 'ind_arg' + arg_no)
    line = line.replace(dim_token, str(dims[m]))
    line = line.replace('ARG', 'arg' + arg_no)
    line = line.replace('TYP', typs[m])
    line = line.replace('IDX', str(int(idxs[m])))
    return line


def code(text):
    """Append one line of generated code to ``file_text``.

    An empty ``text`` produces a bare newline.  Otherwise the text is passed
    through ``rep`` for the current argument ``g_m`` and written indented by
    ``depth`` spaces.  Nothing is written when neither ``FORTRAN`` nor
    ``CPP`` is set.
    """
    global file_text
    if len(text) == 0:
        file_text += '\n'
        return
    if FORTRAN or CPP:
        file_text += ' ' * depth + rep(text, g_m) + '\n'


def code_pre(text):
    """Append a line to ``file_text`` like ``code`` but without indentation.

    The text still goes through ``rep`` for the current argument ``g_m``.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text, g_m) + '\n'


def DO(i, start, finish):
    """Open a counted loop over ``[start, finish)`` and indent by two.

    Fortran: ``DO i = start, finish-1, 1``; C++: a ``for`` loop with ``<``.
    """
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + '-1, 1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def FOR(i, start, finish):
    """Open a counted loop like ``DO``; the Fortran form omits the step."""
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + '-1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def ENDDO():
    """Close the innermost loop (``END DO`` / ``}``) and dedent by two."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def ENDFOR():
    """Close a loop opened with ``FOR``; emits the same text as ``ENDDO``."""
    ENDDO()


def IF(line):
    """Open a conditional on the expression ``line`` and indent by two."""
    global depth
    if FORTRAN:
        code('IF (' + line + ') THEN')
    elif CPP:
        code('if (' + line + ') {')
    depth += 2


def ELSE():
    """Switch the innermost conditional to its else-branch.

    The keyword is written at the indentation of the matching ``IF``.  In
    C++ mode the brace opened by ``IF`` is closed first so that the
    ``IF`` / ``ELSE`` / ``ENDIF`` sequence yields balanced braces.
    """
    global depth
    depth -= 2
    if FORTRAN:
        code('ELSE')
    elif CPP:
        code('} else {')
    depth += 2


def ENDIF():
    """Close the innermost conditional (``END IF`` / ``}``) and dedent by two."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END IF')
    elif CPP:
        code('}')
(0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + 
code('') + comm('variable declarations') + + code('') + + if ninds > 0: #if indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('type ( c_ptr ) , POINTER, DIMENSION(:) :: ind_maps_'+name) + code('type ( c_ptr ) , POINTER, DIMENSION(:) :: mappingArray_'+name) + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_maps'+str(invinds[g_m]+1)+'_'+name) + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('INTEGER(kind=2), POINTER, DIMENSION(:) :: mappingArray'+str(g_m+1)+'_'+name) + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('INTEGER(kind=4) :: mappingArray'+str(g_m+1)+'Size_'+name) + code('') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4) :: blkmapSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_offs_'+name) + code('INTEGER(kind=4) :: ind_offsSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_sizes_'+name) + code('INTEGER(kind=4) :: ind_sizesSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4) :: nelemsSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nthrcol_'+name) + code('INTEGER(kind=4) :: nthrcolSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4) :: offsetSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: thrcol_'+name) + code('INTEGER(kind=4) :: thrcolSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: pnindirect_'+name) + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + if hydra == 0: + comm('user function') + 
code('#include "'+name+'.inc"') + code('') + else: + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + file_text += text + #code(kernels[nk]['mod_file']) + code('') + +########################################################################## +# Generate OpenMP kernel function +########################################################################## + comm('x86 kernel function') + code('SUBROUTINE op_x86_'+name+'( &'); depth = depth + 2 + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+', &') + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+', &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+', &') + + if ninds > 0: #indirect loop + for g_m in range(0,ninds): + code('& ind_maps'+str(invinds[g_m]+1)+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('& mappingArray'+str(g_m+1)+', &') + code('& ind_sizes, &') + code('& ind_offs, &') + code('& blkmap, &') + code('& offset, &') + code('& nelems, &') + code('& nthrcol, &') + code('& thrcol, &') + code('& blockOffset, &') + code('& blockID )') + code('') + else: #direct loop + code('& sliceStart, &') + code('& sliceEnd )') + code('') + + + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local variables') + if nopts>0: + code('INTEGER(kind=4) :: optflags') + if ninds > 0: #indirect loop + for g_m in 
range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(0:*) :: opDat'+str(invinds[g_m]+1)) + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(0:*) :: opDat'+str(g_m+1)) + elif maps[g_m] == OP_GBL: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)) + #code(typs[g_m]+', DIMENSION(1) :: opDat'+str(g_m+1)) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)) + + code('') + for g_m in range(0,ninds): + code('INTEGER(kind=4), DIMENSION(0:), target :: ind_maps'+str(invinds[g_m]+1)) + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('INTEGER(kind=2), DIMENSION(0:*) :: mappingArray'+str(g_m+1)) + code('') + code('INTEGER(kind=4), DIMENSION(0:*) :: ind_sizes') + code('INTEGER(kind=4), DIMENSION(0:*) :: ind_offs') + code('INTEGER(kind=4), DIMENSION(0:*) :: blkmap') + code('INTEGER(kind=4), DIMENSION(0:*) :: offset') + code('INTEGER(kind=4), DIMENSION(0:*) :: nelems') + code('INTEGER(kind=4), DIMENSION(0:*) :: nthrcol') + code('INTEGER(kind=4), DIMENSION(0:*) :: thrcol') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: threadBlockOffset') + code('INTEGER(kind=4) :: threadBlockID') + code('INTEGER(kind=4) :: numberOfActiveThreads') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + code('REAL(kind=8), DIMENSION(0:128000 - 1), target :: sharedFloat8') + if add_int: + code('INTEGER(kind=4), DIMENSION(0:128000 - 1), target :: sharedInt8') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'IndirectionMap') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'SharedIndirection') + +# for indirect 
OP_READ, we would pass in a pointer to shared, offset by map, but if opt, then map may not exist, thus we need a separate pointer + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'nBytes') + + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'RoundUp') + for g_m in range(0,ninds): + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (accs[g_m] == OP_INC): + code('REAL(kind=8), DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Map') + + + code('INTEGER(kind=4) :: numOfColours') + code('INTEGER(kind=4) :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('') + code('threadBlockID = blkmap(blockID + blockOffset)') + code('numberOfActiveThreads = nelems(threadBlockID)') + code('threadBlockOffset = offset(threadBlockID)') + code('numberOfActiveThreadsCeiling = numberOfActiveThreads') + code('numOfColours = nthrcol(threadBlockID)') + code('') + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize = ind_sizes('+str(g_m)+' + threadBlockID * '+str(ninds)+')') + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'IndirectionMap => ind_maps'+str(invinds[g_m]+1)+'(ind_offs('+str(g_m)+' + threadBlockID * '+str(ninds)+'):)') + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'RoundUp = opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize * ('+inddims[g_m]+')') + code('opDat'+str(invinds[g_m]+1)+'RoundUp = opDat'+str(invinds[g_m]+1)+'RoundUp + MOD(opDat'+str(invinds[g_m]+1)+'RoundUp,2)') + + for g_m in range(0,ninds): + if g_m>0 and 
indopts[g_m-1] >= 0: + IF('BTEST(optflags,'+str(optidxs[indopts[g_m-1]])+')') + if g_m == 0: + code('opDat'+str(invinds[g_m]+1)+'nBytes = 0') + else: + prev_size = 0 + if 'real' in typs[invinds[g_m-1]].lower(): + prev_size = 8 + elif 'integer' in typs[invinds[g_m-1]].lower(): + prev_size = 4 + this_size = 0 + if 'real' in typs[invinds[g_m]].lower(): + this_size = 8 + elif 'integer' in typs[invinds[g_m]].lower(): + this_size = 4 + if this_size == 0 or prev_size == 0: + print("ERROR: Unrecognized type") + code('opDat'+str(invinds[g_m]+1)+'nBytes = opDat'+str(invinds[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)+' + opDat'+str(invinds[g_m-1]+1)+'RoundUp * '+str(prev_size)+' / '+str(this_size)) + if g_m>0 and indopts[g_m-1] >= 0: + ELSE() + if g_m==0: + code('opDat'+str(invinds[g_m]+1)+'nBytes = 0') + else: + code('opDat'+str(invinds[g_m]+1)+'nBytes = opDat'+str(invinds[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)) + ENDIF() + + for g_m in range(0,ninds): + if 'REAL' in typs[invinds[g_m]].upper(): + code('opDat'+str(invinds[g_m]+1)+'SharedIndirection => sharedFloat8(opDat'+str(invinds[g_m]+1)+'nBytes:)') + if 'INTEGER' in typs[invinds[g_m]].upper(): + code('opDat'+str(invinds[g_m]+1)+'SharedIndirection => sharedInt8(opDat'+str(invinds[g_m]+1)+'nBytes:)') + code('') + for g_m in range(0,ninds): + if indopts[g_m]>=0: + IF('BTEST(optflags,'+str(optidxs[indopts[g_m]])+')') + DO('i1','0','opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize') + DO('i2','0', inddims[g_m]) + if accs[invinds[g_m]] == OP_READ or accs[invinds[g_m]] == OP_RW or accs[invinds[g_m]] == OP_WRITE: + code('opDat'+str(invinds[g_m]+1)+'SharedIndirection(i2 + i1 * ('+inddims[g_m]+\ + ') + 1) = opDat'+str(invinds[g_m]+1)+'(i2 + opDat'+str(invinds[g_m]+1)+\ + 'IndirectionMap(i1 + 1) * ('+inddims[g_m]+'))') + elif accs[invinds[g_m]] == OP_INC: + code('opDat'+str(invinds[g_m]+1)+'SharedIndirection(i2 + i1 * ('+inddims[g_m]+\ + ') + 1) = 0') + ENDDO() + ENDDO() + if 
indopts[g_m]>=0: + ENDIF() + code('') + + DO('i1','0','numberOfActiveThreadsCeiling') + code(' colour2 = -1') + IF('i1 < numberOfActiveThreads') + + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + DO('i2','0',dims[g_m]) + code('opDat'+str(g_m+1)+'Local(i2) = 0') + ENDDO() + + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1+mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+'):)') + else: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1+mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * 1:)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(0:)') + ENDIF() + + + else: #direct loop + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code(typs[g_m]+', DIMENSION(0:*) :: opDat'+str(g_m+1)) + else: #global arg + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)) + #code(typs[g_m]+', DIMENSION(1) :: opDat'+str(g_m+1)) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)) + + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('INTEGER(kind=4) :: i1') + + +########################################################################## +# x86 kernel call +########################################################################## + + if ninds > 0: #indirect kernel call + code('') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'((i1 + threadBlockOffset) * ('+dims[g_m]+'):(i1 + 
threadBlockOffset) * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + else: + line = line + indent + '& opDat'+str(g_m+1)+'((i1 + threadBlockOffset) * 1)' + if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+'):1 + mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * 1)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==1: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1:'+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1)' + elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line +indent + '& opDat'+str(g_m+1)+'Local(0)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1) + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('colour2 = thrcol(i1 + threadBlockOffset)') + ENDIF() + + code('') + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + if optflags[g_m]: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + code('opDat'+str(g_m+1)+'Map = mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset)') + if optflags[g_m]: + ENDIF() + + code('') + DO('colour1','0','numOfColours') + IF('colour2 .EQ. 
colour1') + for g_m in range(0,nargs): + if optflags[g_m]==1 and maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + DO('i2','0',dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + (i2 + opDat'+str(g_m+1)+'Map * ('+dims[g_m]+'))) = opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + (i2 + opDat'+str(g_m+1)+'Map * ('+dims[g_m]+'))) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + if optflags[g_m]==1 and maps[g_m]==OP_MAP and (accs[g_m] == OP_INC): + ENDIF() + if maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + code('') + ENDIF() + ENDDO() + ENDDO() + code('') + for g_m in range(0,ninds): + if indopts[g_m]>=0 and (accs[invinds[g_m]]==OP_INC or accs[invinds[g_m]]==OP_WRITE or accs[invinds[g_m]]==OP_RW): + IF('BTEST(optflags,'+str(optidxs[indopts[g_m]])+')') + if accs[invinds[g_m]] == OP_INC: + DO('i1','0','opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize') + DO('i2','0',inddims[g_m]) + code('opDat'+str(invinds[g_m]+1)+'(i2 + opDat'+str(invinds[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims[g_m]+')) = opDat'+str(invinds[g_m]+1)+'(i2 + opDat'+str(invinds[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims[g_m]+')) + opDat'+str(invinds[g_m]+1)+'SharedIndirection(1 + (i2 + i1 * ('+inddims[g_m]+')))') + ENDDO() + ENDDO() + if accs[invinds[g_m]] == OP_RW or accs[invinds[g_m]] == OP_WRITE: + DO('i1','0','opDat'+str(invinds[g_m]+1)+'SharedIndirectionSize') + DO('i2','0',inddims[g_m]) + code('opDat'+str(invinds[g_m]+1)+'(i2 + opDat'+str(invinds[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims[g_m]+')) = opDat'+str(invinds[g_m]+1)+'SharedIndirection(1 + (i2 + i1 * ('+inddims[g_m]+')))') + ENDDO() + ENDDO() + if indopts[g_m]>=0 and (accs[invinds[g_m]]==OP_INC or accs[invinds[g_m]]==OP_WRITE or accs[invinds[g_m]]==OP_RW): + ENDIF() + + else: #direct kernel call + code('') + comm('kernel call') + DO('i1','sliceStart', 'sliceEnd') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth 
+ for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1) + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'(i1 * ('+dims[g_m]+'))' + else: + line = line + indent +'& opDat'+str(g_m+1)+'(i1 * ('+dims[g_m]+'):i1 * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate OpenMP hust stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('type ( op_set_core ) , POINTER :: opSet'+str(invinds[g_m]+1)+'Core') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + 
code('type ( op_set_core ) , POINTER :: opSet'+str(g_m+1)+'Core') + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + if (dims[g_m].isdigit()) and (int(dims[g_m]) == 1): + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + + code('') + for g_m in range(0,nargs): + code('type ( op_dat_core ) , POINTER :: opDat'+str(g_m+1)+'Core') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('type ( op_map_core ) , POINTER :: opMap'+str(g_m+1)+'Core') + code('') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatTypesArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('') + + else: + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i11') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code(typs[g_m]+', 
DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + depth = depth - 2 + + if ninds > 0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + code_pre('#endif') + + code('') + code_pre('#ifdef _OPENMP') + code_pre(' numberOfThreads = omp_get_max_threads()') + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray, 2)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%nindirect,pnindirect_'+name+',(/numberOfIndirectOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_maps,ind_maps_'+name+',(/numberOfIndirectOpDats/))') + 
code('CALL c_f_pointer(actualPlan_'+name+'%maps,mappingArray_'+name+',(/numberOfOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_sizes,ind_sizes_'+name+',(/actualPlan_'+name+'%nblocks * numberOfIndirectOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_offs,ind_offs_'+name+',(/actualPlan_'+name+'%nblocks * numberOfIndirectOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,nthrcol_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,thrcol_'+name+',(/set%setPtr%size/))') + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(ind_maps_'+name+'('+str(g_m+1)+'),ind_maps'+str(invinds[g_m]+1)+'_'+name+',(/pnindirect_'+name+'('+str(g_m+1)+')/))') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + IF('indirectionDescriptorArray('+str(g_m+1)+') >= 0') + code('CALL c_f_pointer(mappingArray_'+name+'('+str(g_m+1)+'),mappingArray'+str(g_m+1)+'_'+name+',(/set%setPtr%size/))') + ENDIF() + code('') + + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + if (dims[g_m].isdigit() == 0) or (int(dims[g_m]) > 1): + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + + code('') + for g_m in 
range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local)') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opDat'+str(g_m+1)+'Cardinality/))') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i10','1','numberOfThreads+1') + DO('i11','1',dims[g_m]+'+1') + code('reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i11) = 0') + ENDDO() + ENDDO() + + code('') + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + code('!$OMP PARALLEL DO private (threadID)') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('CALL op_x86_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + elif maps[g_m] == OP_GBL and (accs[g_m] == OP_READ or accs[g_m] == OP_WRITE): + code('& opDat'+str(g_m+1)+'Local, &') + + for g_m in range(0,ninds): + code('& ind_maps'+str(invinds[g_m]+1)+'_'+name+', &') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + code('& mappingArray'+str(g_m+1)+'_'+name+', &') + + code('& ind_sizes_'+name+', &') + code('& ind_offs_'+name+', &') + code('& blkmap_'+name+', &') + code('& offset_'+name+', &') + code('& nelems_'+name+', &') + code('& nthrcol_'+name+', &') + code('& thrcol_'+name+', &') + code('& blockOffset,i2)') + + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + nblocks') + ENDDO() + code('') + + + else: #direct loop host stub call + code('!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + code('CALL op_x86_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + elif maps[g_m] == OP_GBL and 
(accs[g_m] == OP_READ or accs[g_m] == OP_WRITE): + code('& opDat'+str(g_m+1)+'Local, &') + code('& sliceStart, &') + code('& sliceEnd)') + ENDDO() + code('!$OMP END PARALLEL DO') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i10','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i11','1',dims[g_m]+'+1') + code('opDat'+str(g_m+1)+'Local(i11) = opDat'+str(g_m+1)+'Local(i11) + reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i11)') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = 
dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_kernel.F95','w') + else: + fid = open(name+'_kernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_openmp2.py b/translator/fortran/op2_gen_openmp2.py new file mode 100644 index 000000000..6b37fe210 --- /dev/null +++ b/translator/fortran/op2_gen_openmp2.py @@ -0,0 +1,607 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_kernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! '+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' + +def rep(line,m): + global dims, idxs, typs, indtyps, inddims + + if FORTRAN: + if m < len(inddims): + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m+1),line) + line = re.sub('DIMS',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m+1),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + elif CPP: + line = re.sub('INDDIM',str(inddims[m]),line) + line = re.sub('INDTYP',str(indtyps[m]),line) + + line = re.sub('INDARG','ind_arg'+str(m),line) + line = re.sub('DIM',str(dims[m]),line) + line = re.sub('ARG','arg'+str(m),line) + line = re.sub('TYP',typs[m],line) + line = re.sub('IDX',str(int(idxs[m])),line) + return line + +def code(text): + global file_text, FORTRAN, CPP, g_m + global depth + if len(text) == 0: + file_text += '\n' + return + if len(text) == 0: + prefix = '' + else: + prefix = ' '*depth + if FORTRAN: + file_text += prefix+rep(text,g_m)+'\n' + elif CPP: + file_text += prefix+rep(text,g_m)+'\n' + +def 
code_pre(text): + global file_text, FORTRAN, CPP, g_m + if FORTRAN: + file_text += rep(text,g_m)+'\n' + elif CPP: + file_text += rep(text,g_m)+'\n' + +def DO(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1, 1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def FOR(i,start,finish): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('DO '+i+' = '+start+', '+finish+'-1') + elif CPP: + code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){') + depth += 2 + +def ENDDO(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def ENDFOR(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END DO') + elif CPP: + code('}') + +def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + + +def op2_gen_openmp2(master, date, consts, kernels, hydra): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range 
(0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + + + code('') + +########################################################################## +# Inline user kernel function 
+########################################################################## + code('') + code('CONTAINS') + code('') + if hydra == 0: + comm('user function') + code('#include "'+name+'.inc"') + code('') + else: + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('subroutine '+name, 'subroutine '+name+'_cpu') + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '_cpu\n' + file_text += text + #code(kernels[nk]['mod_file']) + code('') + +########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: 
opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + + if ninds > 0: #if indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == 
OP_INC: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + if ninds > 0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + code_pre('#endif') + + code('') + code_pre('#ifdef _OPENMP') + code_pre(' numberOfThreads = omp_get_max_threads()') + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + + code('') + code('opSetCore => set%setPtr') + code('') + for 
g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i1','1','numberOfThreads+1') + DO('i2','1',dims[g_m]+'+1') + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = 0') + ENDDO() + ENDDO() + code('') + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + line = '' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + line = line + ', opDat'+str(g_m+1)+'OptPtr' + code('!$OMP PARALLEL DO private (threadID, blockID, nelem, offset_b'+line+')') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('blockID = blkmap_'+name+'(i2+blockOffset+1)') + code('nelem = nelems_'+name+'(blockID+1)') + code('offset_b = offset_'+name+'(blockID+1)') + DO('n','0','nelem') + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + IF('opArg'+str(g_m+1)+'%opt == 1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + (n+offset_b) * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+'):)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1:)') + ENDIF() + comm('kernel call') + line = 'CALL '+name+'_cpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + (n+offset_b) * ('+dims[g_m]+') : (n+offset_b) * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + (n+offset_b))' + if maps[g_m] == OP_MAP and optflags[g_m]==0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + (n+offset_b) * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+') : opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + (n+offset_b) * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+') * ('+dims[g_m]+') 
+ '+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1 + opDat'+str(invinds[inds[g_m]-1]+1)+'Map(1 + (n+offset_b) * opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+'))' + elif maps[g_m] == OP_MAP and optflags[g_m]==1: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1:'+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1)' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1:'+dims[g_m]+')' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + nblocks') + ENDDO() + else: + code('!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + comm('kernel call') + DO('n','sliceStart', 'sliceEnd') + line = 'CALL '+name+'_cpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + n * ('+dims[g_m]+') : n * ('+dims[g_m]+') + '+dims[g_m]+')' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1 + n)' + if maps[g_m] == OP_GBL: + if accs[g_m] == OP_INC: + line = line + indent + '& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1)' + else: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent +'& 
opDat'+str(g_m+1)+'Local(1:'+dims[g_m]+')' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + ENDDO() + code('!$OMP END PARALLEL DO') + + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i1','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','1',dims[g_m]+'+1') + code('opDat'+str(g_m+1)+'Local(i2) = opDat'+str(g_m+1)+'Local(i2) + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2)') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for 
g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_kernel.F95','w') + else: + fid = open(name+'_kernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
# ---------------------------------------------------------------------------
# Text-emission helpers.
#
# All helpers append to the module-level string ``file_text``.  They read the
# module-level flags ``FORTRAN`` / ``CPP`` to pick the output syntax, the
# module-level ``depth`` for the current indentation width (in spaces) and
# ``g_m`` for the argument index handed to ``rep``.
# ---------------------------------------------------------------------------

def comm(line):
  """Append a comment line to ``file_text``.

  An empty ``line`` produces a bare newline.  In FORTRAN mode the comment is
  written as ``! <line>`` without indentation; in CPP mode it is written as
  ``//<line>`` indented by ``depth`` spaces.  With neither flag set nothing
  is written.
  """
  global file_text
  if not line:
    file_text += '\n'
    return
  indent = ' ' * depth
  if FORTRAN:
    file_text += '! ' + line + '\n'
  elif CPP:
    file_text += indent + '//' + line + '\n'


def rep(line, m):
  """Return ``line`` with the placeholder tokens expanded for argument ``m``.

  Tokens are replaced one after another with ``re.sub`` in a fixed order, so
  longer tokens (``INDARG``) are consumed before their substrings (``ARG``).
  FORTRAN mode uses 1-based argument names, the ``DIMS`` token, and only
  expands ``INDDIM`` / ``INDTYP`` when ``m`` is a valid index into
  ``inddims``.  CPP mode uses 0-based names and the ``DIM`` token.  With
  neither flag set the line is returned unchanged.
  """
  def substitutions():
    # Generator on purpose: every replacement value is looked up only at the
    # moment it is about to be applied.
    if FORTRAN:
      if m < len(inddims):
        yield 'INDDIM', str(inddims[m])
        yield 'INDTYP', str(indtyps[m])
      yield 'INDARG', 'ind_arg' + str(m + 1)
      yield 'DIMS', str(dims[m])
      yield 'ARG', 'arg' + str(m + 1)
      yield 'TYP', typs[m]
      yield 'IDX', str(int(idxs[m]))
    elif CPP:
      yield 'INDDIM', str(inddims[m])
      yield 'INDTYP', str(indtyps[m])
      yield 'INDARG', 'ind_arg' + str(m)
      yield 'DIM', str(dims[m])
      yield 'ARG', 'arg' + str(m)
      yield 'TYP', typs[m]
      yield 'IDX', str(int(idxs[m]))

  for token, value in substitutions():
    line = re.sub(token, value, line)
  return line


def code(text):
  """Append one line of generated code, indented by ``depth`` spaces.

  An empty ``text`` produces a bare newline.  Otherwise the text is passed
  through ``rep`` (using the current ``g_m``) before being written.
  """
  global file_text
  if not text:
    file_text += '\n'
    return
  indent = ' ' * depth
  if FORTRAN or CPP:
    file_text += indent + rep(text, g_m) + '\n'


def code_pre(text):
  """Append one line with no indentation (used for preprocessor lines).

  The text still goes through ``rep``; unlike ``code`` there is no special
  case for an empty string.
  """
  global file_text
  if FORTRAN or CPP:
    file_text += rep(text, g_m) + '\n'


def _emit_keyword(fortran_text, cpp_text):
  """Emit the fixed keyword line matching the active output language."""
  if FORTRAN:
    code(fortran_text)
  elif CPP:
    code(cpp_text)


def DO(i, start, finish):
  """Open a counted loop over ``start .. finish-1`` and indent by two.

  FORTRAN output carries an explicit stride of 1.
  """
  global depth
  if FORTRAN:
    code(''.join(('DO ', i, ' = ', start, ', ', finish, '-1, 1')))
  elif CPP:
    code(''.join(('for ( int ', i, '=', start, '; ', i, '<', finish, '; ', i, '++ ){')))
  depth += 2


def FOR(i, start, finish):
  """Open a counted loop over ``start .. finish-1`` and indent by two.

  Same as ``DO`` except that the FORTRAN output omits the stride.
  """
  global depth
  if FORTRAN:
    code(''.join(('DO ', i, ' = ', start, ', ', finish, '-1')))
  elif CPP:
    code(''.join(('for ( int ', i, '=', start, '; ', i, '<', finish, '; ', i, '++ ){')))
  depth += 2


def ENDDO():
  """Dedent by two and close a loop opened with ``DO``."""
  global depth
  depth -= 2
  _emit_keyword('END DO', '}')


def ENDFOR():
  """Dedent by two and close a loop opened with ``FOR``."""
  global depth
  depth -= 2
  _emit_keyword('END DO', '}')


def IF(line):
  """Open a conditional on the expression ``line`` and indent by two."""
  global depth
  if FORTRAN:
    code(''.join(('IF (', line, ') THEN')))
  elif CPP:
    code(''.join(('if (', line, ') {')))
  depth += 2


def ELSE():
  """Emit the else keyword at the level of the matching ``IF``.

  The indentation is dropped for the keyword itself and restored afterwards
  for the body of the else branch.
  """
  global depth
  depth -= 2
  _emit_keyword('ELSE', 'else {')
  depth += 2


def ENDIF():
  """Dedent by two and close a conditional opened with ``IF``."""
  global depth
  depth -= 2
  _emit_keyword('END IF', '}')
+########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = list(kernels[nk]['dims']) + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()):# and not (dims[g_m] in ['NPDE','DNTQMU','DNFCROW','1*1']): + needDimList = needDimList + [g_m] + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['master_file']+'_'+kernels[nk]['mod_file'][9:]+'_module_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0 and bookleaf==0: + code('USE OP2_CONSTANTS') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + + code('') + if bookleaf==0: + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + if bookleaf==0: + code('#endif') + + + code('') + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + if hydra: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'][9:]+'_module' + filename = 'kernels/'+kernels[nk]['master_file']+'_'+name+'.inc' + if not os.path.isfile(filename): + files = [f for f in glob.glob('kernels/*'+name+'.inc')] + if 
len(files)>0: + filename = files[0] + else: + print('kernel for '+name+' not found') + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('recursive subroutine','subroutine') + text = text.replace(' module',' !module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + + # + # substitute npdes with DNPDE + # +# using_npdes = 0 +# for g_m in range(0,nargs): +# if var[g_m] == 'npdes': +# using_npdes = 1 +# if using_npdes: +# i = re.search('\\bnpdes\\b',text) +# j = i.start() +# i = re.search('\\bnpdes\\b',text[j:]) +# j = j + i.start()+5 +# i = re.search('\\bnpdes\\b',text[j:]) +# j = j + i.start()+5 +# text = text[1:j] + re.sub('\\bnpdes\\b','NPDE',text[j:]) + + file_text += text + file_text += '\n#undef MIN\n' + file_text += '\n#undef MAX\n' + #code(kernels[nk]['mod_file']) + elif bookleaf: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + file_text += text[i:j]+'\n\n' + else: + comm('user function') + code('#include "'+name+'.inc"') + code('') + + + code('') + +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + 
code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& bottom,top)') + + code('implicit none') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Dim') + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',*)') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('INTEGER(kind=4) opDat'+str(g_m+1)+'Dim') + if maps[g_m] == OP_ID: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',*)') + elif maps[g_m] == OP_GBL: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(*)') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + + code('INTEGER(kind=4) bottom,top,i1') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + code('') + DO('i1','bottom','top') + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + if maps[g_m] == OP_MAP: + line = 
line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + + ENDDO() + depth = depth - 2 + code('END SUBROUTINE') + +########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, 
DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + + if ninds > 0: #if indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + #code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + 
code('n_upper = op_mpi_halo_exchanges_grouped(set%setCPtr,numberOfOpDats,opArgArray,1)') + code('') + + if ninds > 0: + if bookleaf==0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + if bookleaf==0: + code_pre('#endif') + + code('') + if bookleaf==0: + code_pre('#ifdef _OPENMP') + code_pre(' numberOfThreads = omp_get_max_threads()') + if bookleaf==0: + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,2)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + 
for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + code('') + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + #reductions + for g_m in range(0,nargs): + if optflags[g_m] == 1: + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + IF('opArg'+str(g_m+1)+'%opt == 1') + DO('i1','1','numberOfThreads+1') + DO('i2','1',dims[g_m]+'+1') + if accs[g_m] == OP_INC: + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = 0') + else: + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + ENDDO() + ENDIF() + else: + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ: + code('allocate( reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i1','1','numberOfThreads+1') + DO('i2','1',dims[g_m]+'+1') + if accs[g_m] == OP_INC: + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = 0') + else: + code('reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2) = opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + ENDDO() + + code('') + + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_core') + #code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,1)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + line = '' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + line = line + ', opDat'+str(g_m+1)+'OptPtr' + code('!$OMP PARALLEL DO private (threadID, blockID, nelem, offset_b'+line+')') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('blockID = blkmap_'+name+'(i2+blockOffset+1)') + code('nelem = nelems_'+name+'(blockID+1)') + code('offset_b = offset_'+name+'(blockID+1)') + + code('CALL op_wrap_'+name+'( &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if accs[g_m] != OP_READ: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& offset_b, offset_b+nelem)') + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + nblocks') + ENDDO() + else: + code('!$OMP PARALLEL DO private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + comm('kernel call') + code('CALL op_wrap_'+name+'( &') + for 
g_m in range(0,nargs): + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] != OP_READ: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + code('& sliceStart, sliceEnd)') + ENDDO() + code('!$OMP END PARALLEL DO') + + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + #code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + code('CALL op_mpi_wait_all_grouped(numberOfOpDats,opArgArray,1)') + ENDIF() + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if optflags[g_m] == 1: + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i1','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','1',dims[g_m]+'+1') + if accs[g_m] == OP_INC: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('opDat'+str(g_m+1)+'Local(i2) = opDat'+str(g_m+1)+'Local(i2) + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2)') + ENDIF() + if accs[g_m] == OP_MIN: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('opDat'+str(g_m+1)+'Local(i2) = MIN(opDat'+str(g_m+1)+'Local(i2) , reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2))') + ENDIF() + if accs[g_m] == OP_MAX: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('opDat'+str(g_m+1)+'Local(i2) = MAX(opDat'+str(g_m+1)+'Local(i2) , reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2))') + ENDIF() + ENDDO() + else: + if accs[g_m] == OP_INC: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDIF() + if accs[g_m] == OP_MIN: + IF('opArg'+str(g_m+1)+'%opt == 1') + 
code('opDat'+str(g_m+1)+'Local = MIN(opDat'+str(g_m+1)+'Local, reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1))') + ENDIF() + if accs[g_m] == OP_MAX: + IF('opArg'+str(g_m+1)+'%opt == 1') + code('opDat'+str(g_m+1)+'Local = MAX(opDat'+str(g_m+1)+'Local, reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1))') + ENDIF() + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + else: + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i1','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i2','1',dims[g_m]+'+1') + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'Local(i2) = opDat'+str(g_m+1)+'Local(i2) + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2)') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'Local(i2) = MIN(opDat'+str(g_m+1)+'Local(i2) , reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2))') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'Local(i2) = MAX(opDat'+str(g_m+1)+'Local(i2) , reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i2))') + ENDDO() + else: + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'Local = MIN(opDat'+str(g_m+1)+'Local, reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1))') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'Local = MAX(opDat'+str(g_m+1)+'Local, reductionArrayHost'+str(g_m+1)+'((i1 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1))') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if optflags[g_m] == 1: + if 
typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + IF('opArg'+str(g_m+1)+'%opt == 1') + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + ENDIF() + code('') + else: + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)' or typs[g_m] == 'real*8' or typs[g_m] == 'r8': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)' or typs[g_m] == 'real*4' or typs[g_m] == 'r4': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)' or typs[g_m] == 'integer*4' or typs[g_m] == 'i4': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 
getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'_'+name + fid = open(name+'_ompkernel.F90','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_kernel.f90','w') + else: + fid = open(name+'_kernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! auto-generated by op2.py\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_openmp4.py b/translator/fortran/op2_gen_openmp4.py new file mode 100644 index 000000000..6ecb67071 --- /dev/null +++ b/translator/fortran/op2_gen_openmp4.py @@ -0,0 +1,1045 @@ +########################################################################## +# +# OpenACC code generator +# +# This routine is called by op2_fortran which parses the input files +# +# It produces a file xxx_acckernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os +import util + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
def comm(line):
    """Append one comment line to the generated source in ``file_text``.

    An empty ``line`` produces a bare newline.  In Fortran mode the comment
    is written as ``'! ' + line`` with no indentation; in C++ mode it is
    written as ``'//' + line`` indented by ``depth`` spaces.  Nothing is
    written when neither language flag is set.
    """
    global file_text
    if len(line) == 0:
        file_text += '\n'
    elif FORTRAN:
        file_text += '! ' + line + '\n'
    elif CPP:
        file_text += ' ' * depth + '//' + line + '\n'


def rep(line, m):
    """Expand the per-argument placeholders in ``line`` for argument ``m``.

    Placeholders are replaced in this order, using the module-level kernel
    description lists:

    * ``INDDIM`` / ``INDTYP`` -> ``inddims[m]`` / ``indtyps[m]`` (only when
      ``m`` is a valid index into ``inddims``)
    * ``INDARG``              -> ``ind_arg<k>``
    * ``DIMS`` (Fortran) or ``DIM`` (C++) -> ``dims[m]``
    * ``ARG``                 -> ``arg<k>``
    * ``TYP``                 -> ``typs[m]``
    * ``IDX``                 -> ``int(idxs[m])``

    ``<k>`` is ``m + 1`` in Fortran mode and ``m`` in C++ mode.  The line is
    returned unchanged when neither language flag is set.
    """
    if FORTRAN:
        arg_number = str(m + 1)
        dim_token = 'DIMS'
    elif CPP:
        arg_number = str(m)
        dim_token = 'DIM'
    else:
        return line

    # Replacement is literal (str.replace) rather than re.sub: the patterns
    # are fixed words, and re.sub would interpret backslashes in the
    # data-derived replacement text as escapes or group references.
    #
    # The bounds guard applies to both languages; the C++ path previously
    # indexed inddims/indtyps unconditionally and raised IndexError for
    # arguments beyond the number of indirect datasets.
    if m < len(inddims):
        line = line.replace('INDDIM', str(inddims[m]))
        line = line.replace('INDTYP', str(indtyps[m]))

    # Longer placeholders go first so that 'ARG' does not eat 'INDARG'.
    line = line.replace('INDARG', 'ind_arg' + arg_number)
    line = line.replace(dim_token, str(dims[m]))
    line = line.replace('ARG', 'arg' + arg_number)
    line = line.replace('TYP', typs[m])
    line = line.replace('IDX', str(int(idxs[m])))
    return line


def code(text):
    """Append one line of generated code to ``file_text``.

    An empty ``text`` produces a bare newline.  Otherwise the text is run
    through ``rep`` for the current argument index ``g_m``, indented by
    ``depth`` spaces and terminated with a newline.  Nothing is written when
    neither language flag is set.
    """
    global file_text
    if len(text) == 0:
        file_text += '\n'
        return
    if FORTRAN or CPP:
        file_text += ' ' * depth + rep(text, g_m) + '\n'


def code_pre(text):
    """Append ``text`` (expanded by ``rep``) to ``file_text`` with no indent.

    Unlike ``code`` the line is never indented, and an empty ``text`` is
    still passed through ``rep``.
    """
    global file_text
    if FORTRAN or CPP:
        file_text += rep(text, g_m) + '\n'


def DO(i, start, finish):
    """Open a counted loop over ``start .. finish-1`` and indent by two.

    Fortran: ``DO i = start, finish-1, 1``; C++: a ``for`` loop with an
    exclusive upper bound.  The header is emitted through ``code``, so any
    placeholders in the bounds are expanded by ``rep``.
    """
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + '-1, 1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def FOR(i, start, finish):
    """Open a counted loop like ``DO`` but without the explicit Fortran step.

    Fortran: ``DO i = start, finish-1``; the C++ output is the same as
    ``DO``.  Indentation is increased by two afterwards.
    """
    global depth
    if FORTRAN:
        code('DO ' + i + ' = ' + start + ', ' + finish + '-1')
    elif CPP:
        code('for ( int ' + i + '=' + start + '; ' + i + '<' + finish + '; ' + i + '++ ){')
    depth += 2


def ENDDO():
    """Close a loop: dedent by two, then emit ``END DO`` (Fortran) or ``}``."""
    global depth
    depth -= 2
    if FORTRAN:
        code('END DO')
    elif CPP:
        code('}')


def ENDFOR():
    """Close a loop opened with ``FOR``; emits exactly what ``ENDDO`` emits."""
    ENDDO()
+def IF(line): + global file_text, FORTRAN, CPP, g_m + global depth + if FORTRAN: + code('IF ('+line+') THEN') + elif CPP: + code('if ('+ line + ') {') + depth += 2 + +def ELSE(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('ELSE') + elif CPP: + code('else {') + depth += 2 + +def ENDIF(): + global file_text, FORTRAN, CPP, g_m + global depth + depth -= 2 + if FORTRAN: + code('END IF') + elif CPP: + code('}') + + +arg_parse=util.arg_parse +replace_consts=util.replace_consts +replace_npdes=util.replace_npdes +get_stride_string=util.get_stride_string +replace_soa = util.replace_soa +find_function_calls=util.find_function_calls + + +def op2_gen_openmp4(master, date, consts, kernels, hydra,bookleaf): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + set_name = kernels[nk]['set'] + mapnames = kernels[nk]['mapnames'] + invmapinds = kernels[nk]['invmapinds'] + mapinds = 
kernels[nk]['mapinds'] + nmaps = 0 + if ninds > 0: + nmaps = max(mapinds)+1 + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + stage_soa = nopts + stage_flags=[0]*nargs; + host_exec = 0 + + for g_m in range(0,nargs): + if 'NPDE' in dims[g_m]: + dims[g_m] = dims[g_m].replace('NPDE','6') + try: + newdim = str(eval(dims[g_m])) + dims[g_m] = newdim + except NameError as inst: + dims[g_m] + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not dims[g_m].isdigit(): + print('WARNING: unknown dimension reduction argument '+str(g_m)+' in '+name+': host sequential execution') + host_exec = 1 + for i in range(0,nargs): + soaflags[i] = 0 +# for g_m in range(0,nargs): +# if dims[g_m] == 'NPDE': +# dims[g_m] = '6' + + if 'UPDATE_EXPK' in name: + host_exec=1 + + if host_exec: + for i in range(0,nargs): + soaflags[i] = 0 + + + is_soa = -1 + for i in range(0,nargs): + if soaflags[i] == 1: + is_soa = i + break + + unknown_reduction_size = 0 + needDimList = [] + for g_m in range(0,nargs): + if (not dims[g_m].isdigit()): + found=0 + for string in ['NPDE','DNTQMU','DNFCROW','1*1']: + if string in dims[g_m]: + found=1 + if found==0: + needDimList = needDimList + [g_m] + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MAX or accs[g_m] == OP_MIN): + 
unknown_reduction_size = 1 + soaflags[g_m] = 1 + is_soa = 1 + + for idx in needDimList: + dims[idx] = 'opDat'+str(idx+1)+'Dim' + + +########################################################################## +# Generate Header +########################################################################## + + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + modfile = kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_CONSTANTS') + + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if bookleaf: + code('USE kinds_mod, ONLY: ink,rlk') + code('USE parameters_mod,ONLY: LI') + + code('') + if bookleaf==0: + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + if bookleaf==0: + code('#endif') + + + code('') + + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm(name+'variable declarations') + code('') + + #strides for SoA + if any_soa and not host_exec: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) :: opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT') + code('!$omp declare target(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT)') + dir_soa = -1 + for g_m in range(0,nargs): + if maps[g_m] == OP_ID and ((not dims[g_m].isdigit()) or int(dims[g_m]) > 1): + code('INTEGER(kind=4) :: direct_stride_OP2CONSTANT') + code('!$omp declare target(direct_stride_OP2CONSTANT)') + 
dir_soa = g_m + break + + code('') + + if is_soa > -1: + code('#define OP2_SOA(var,dim,stride) var((dim-1)*stride+1)') + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + if hydra: + file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + code('') + comm(name + ' user functions (CPU and GPU)') + code('') + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + text = text.replace('recursive subroutine','subroutine') + code('') + #remove all comments + util.const_list = [] + text = re.sub('!.*\n','\n',text) +# if not host_exec: +# text = replace_consts(text) + + text = text.replace('subroutine '+name, 'subroutine '+name+'_gpu') + + using_npdes = 0 + for g_m in range(0,nargs): + if var[g_m] == 'npdes': + using_npdes = 1 + if using_npdes==1: + text = replace_npdes(text) + + if not host_exec: + #find subroutine calls + util.funlist = [name.lower()] + plus_kernels = find_function_calls(text,'') + + if plus_kernels == '': + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + + text = text + '\n' + plus_kernels + for fun in util.funlist: + regex = re.compile('\\b'+fun+'\\b',re.I) + text = regex.sub(fun+'_gpu',text) + + if plus_kernels != '': + print(name) + for i in range(0,nargs): + if soaflags[i]==1 and not (maps[i] ==OP_GBL): + stage_flags[i] = 1; + stage_soa = 1 + + #strip "use" statements + i = re.search('\\buse\\b',text.lower()) + i_offset = 0 + while not (i is None): + i_offset = i_offset+i.start() + if not ('HYDRA_CONST_MODULE' in text[i_offset:i_offset+23]): + text = text[0:i_offset]+'!'+text[i_offset:] + i_offset = i_offset+4 + i = re.search('\\buse\\b',text[i_offset:].lower()) + + + file_text += text + #code(kernels[nk]['mod_file']) + elif bookleaf: + 
file_text += '!DEC$ ATTRIBUTES FORCEINLINE :: ' + name + '\n' + modfile = kernels[nk]['mod_file'] + prefixes=['./','ale/','utils/','io/','eos/','hydro/','mods/'] + prefix_i=0 + while (prefix_i<7 and (not os.path.exists(prefixes[prefix_i]+modfile))): + prefix_i=prefix_i+1 + fid = open(prefixes[prefix_i]+modfile, 'r') + text = fid.read() + i = re.search('SUBROUTINE '+name+'\\b',text).start() #text.find('SUBROUTINE '+name) + j = i + 10 + text[i+10:].find('SUBROUTINE '+name) + 11 + len(name) + text = text[i:j]+'\n\n' + text = re.sub(r'subroutine\s*'+name, r'subroutine '+name+'_gpu',text,2,re.IGNORECASE) + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,0,hydra,bookleaf) + file_text += text + else: + comm('user function') + fid = open(name+'.inc', 'r') + text = fid.read() + text = replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,1,hydra,bookleaf) + text = text.replace(name, name+'_gpu',1) + code(text) + + + code('') + +########################################################################## +# Generate wrapper to iterate over set +########################################################################## + + code('SUBROUTINE op_wrap_'+name+'( &') + depth = depth + 2 + if nopts >0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opDat'+str(invinds[g_m]+1)+'Dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + code('& opDat'+str(invinds[g_m]+1)+'Size, &') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('& opDat'+str(g_m+1)+'Dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + if ninds > 0: + code('& col_reord, set_size, &') + code('& bottom,top,set_size_full)') + + code('implicit none') + if 
ninds>0: + code('INTEGER(kind=4) set_size') + code('INTEGER(kind=4) col_reord(set_size)') + code('INTEGER(kind=4) set_size_full') + if nopts>0: + code('INTEGER(kind=4), VALUE :: optflags') + for g_m in range(0,ninds): + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Size') + if invinds[g_m] in needDimList: + code('INTEGER(kind=4) opDat'+str(invinds[g_m]+1)+'Dim') + if soaflags[invinds[g_m]]: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+'*opDat'+str(invinds[g_m]+1)+'Size)') + else: + code(typs[invinds[g_m]]+' opDat'+str(invinds[g_m]+1)+'Local('+str(dims[invinds[g_m]])+',opDat'+str(invinds[g_m]+1)+'Size)') + for g_m in range(0,nargs): + if maps[g_m] != OP_MAP: + if g_m in needDimList: + code('INTEGER(kind=4) opDat'+str(g_m+1)+'Dim') + if maps[g_m] == OP_ID: + if soaflags[g_m]: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+'*set_size_full)') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+',set_size_full)') + elif maps[g_m] == OP_GBL: + if accs[g_m]!=OP_READ and accs[g_m]!=OP_WRITE and dims[g_m].isdigit() and int(dims[g_m])==1: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+' opDat'+str(g_m+1)+'Local('+str(dims[g_m])+')') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim') + code('INTEGER(kind=4) opDat'+str(invinds[inds[g_m]-1]+1)+'Map(opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim*set_size)') + + + if not host_exec: + #when functions call functions, we can no longer reliably do SoA, therefore we need to stage everything in registers + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if g_m in needDimList: + print('Error, cannot statically determine dim of argument '+str(g_m+1)+' in kernel '+name) + sys.exit(-1) + code(typs[g_m]+', DIMENSION('+dims[g_m]+') :: opDat'+str(g_m+1)+'Staged') + + code('') + + if not 
host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code(typs[g_m]+' opDat'+str(g_m+1)+'Local_'+str(d+1)) + code(typs[g_m]+' opDat'+str(g_m+1)+'LocalArr('+dims[g_m]+')') + + + code('INTEGER(kind=4) bottom,top,i1,i2') + if nmaps > 0: + k = [] + line = 'INTEGER(kind=4) ' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line += 'map'+str(mapinds[g_m]+1)+'idx, ' + code(line[:-2]) + if stage_soa>0: + code('INTEGER(kind=4) i3') + + code('') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = opDat'+str(g_m+1)+'Local('+str(d+1)+')') + + code('') + + line = '!$omp target teams distribute parallel do &\n' + for g_m in range(0,ninds): + line = line + '!$omp& map(to:opDat'+str(invinds[g_m]+1)+'Local) &\n' + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + line = line + '!$omp& map(to:opDat'+str(g_m+1)+'Local) &\n' + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + line = line + '!$omp& map(to:opDat'+str(invinds[inds[g_m]-1]+1)+'Map) &\n' + if ninds > 0: + line = line + '!$omp& map(to:col_reord) private(i1) &\n' + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapinds[g_m] in k): + k = k + [mapinds[g_m]] + line = line + '!$omp& private(map'+str(mapinds[g_m]+1)+'idx) &\n' + + + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + if int(dims[g_m])==1: + if accs[g_m] == OP_INC: + line = line + '!$omp& reduction(+:'+'opDat'+str(g_m+1)+'Local) map(tofrom:opDat'+str(g_m+1)+'Local) &\n' + if 
accs[g_m] == OP_MIN: + line = line + '!$omp& reduction(min:'+'opDat'+str(g_m+1)+'Local) map(tofrom:opDat'+str(g_m+1)+'Local) &\n' + if accs[g_m] == OP_MAX: + line = line + '!$omp& reduction(max:'+'opDat'+str(g_m+1)+'Local) map(tofrom:opDat'+str(g_m+1)+'Local) &\n' + else: + for d in range(0,int(dims[g_m])): + if accs[g_m] == OP_INC: + line = line + '!$omp& reduction(+:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') map(tofrom:opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if accs[g_m] == OP_MIN: + line = line + '!$omp& reduction(min:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') map(tofrom:opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if accs[g_m] == OP_MAX: + line = line + '!$omp& reduction(max:'+'opDat'+str(g_m+1)+'Local_'+str(d+1)+') map(tofrom:opDat'+str(g_m+1)+'Local_'+str(d+1)+') &\n' + if stage_soa>0: + line = line + '!$omp& private(i3) &\n' + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + '!$omp& private(opDat'+str(g_m+1)+'Staged) &\n' + + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + line = line + '!$omp& private(opDat'+str(g_m+1)+'LocalArr) &\n' + line = line[:-2] + if not host_exec: + code(line) + + if ninds > 0 and not host_exec: + DO('i2','bottom','top') + code('i1 = col_reord(i2+1)') + else: + DO('i1','bottom','top') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'LocalArr = 0') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'LocalArr = HUGE(opDat'+str(g_m+1)+'Local_1)') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'LocalArr = -HUGE(opDat'+str(g_m+1)+'Local_1)') + + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and ((not (optflags[g_m]*nargs+mapinds[g_m]) in k) and (not mapinds[g_m] in k)): + k = k + 
[(optflags[g_m]*nargs+mapinds[g_m])] + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if host_exec: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 * opDat'+str(invmapinds[inds[g_m]-1]+1)+'MapDim + '+str(int(idxs[g_m])-1)+')+1') + else: + code('map'+str(mapinds[g_m]+1)+'idx = opDat'+str(invmapinds[inds[g_m]-1]+1)+'Map(1 + i1 + set_size * '+str(int(idxs[g_m])-1)+')+1') + if optflags[g_m]==1: + ENDIF() + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i3','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i3+1) = opDat'+str(invinds[inds[g_m]-1]+1)+'Local &') + code(' & (i3 * '+get_stride_string(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx)') + ENDDO() + else: + DO('i3','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Staged(i3+1) = opDat'+str(g_m+1)+'Local &') + code(' & (1 + i3 * direct_stride_OP2CONSTANT + i1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + + comm('kernel call') + line = 'CALL '+name+'_gpu( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if stage_flags[g_m] == 1: + line = line + indent + '& opDat'+str(g_m+1)+'Staged' + elif maps[g_m] == OP_ID: + if soaflags[g_m]: + line = line + indent + '& opDat'+str(g_m+1)+'Local(i1+1)' + else: + line = line + indent + '& opDat'+str(g_m+1)+'Local(1,i1+1)' + elif maps[g_m] == OP_MAP: + if soaflags[g_m]: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(map'+str(mapinds[g_m]+1)+'idx)' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'Local(1,map'+str(mapinds[g_m]+1)+'idx)' + elif maps[g_m] == OP_GBL: + if accs[g_m]!=OP_READ and accs[g_m] != OP_WRITE: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line + indent +'& opDat'+str(g_m+1)+'Local' + else: + if host_exec: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + else: + line = line + indent +'& 
opDat'+str(g_m+1)+'LocalArr' + else: + line = line + indent +'& opDat'+str(g_m+1)+'Local(1)' + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + code(line + indent + '& )') + if not host_exec: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + if accs[g_m] == OP_INC: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = opDat'+str(g_m+1)+'Local_'+str(d+1)+' + opDat'+str(g_m+1)+'LocalArr('+str(d+1)+')') + if accs[g_m] == OP_MIN: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = MIN(opDat'+str(g_m+1)+'Local_'+str(d+1)+', opDat'+str(g_m+1)+'LocalArr('+str(d+1)+'))') + if accs[g_m] == OP_MAX: + code('opDat'+str(g_m+1)+'Local_'+str(d+1)+' = MAX(opDat'+str(g_m+1)+'Local_'+str(d+1)+', opDat'+str(g_m+1)+'LocalArr('+str(d+1)+'))') + + for g_m in range(0,nargs): + if stage_flags[g_m] == 1 and accs[g_m] != OP_READ: + if optflags[g_m]==1: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if maps[g_m] == OP_MAP: + DO('i3','0', dims[g_m]) + code('opDat'+str(invinds[inds[g_m]-1]+1)+'Local(i3 * '+get_stride_string(g_m,maps,mapnames,set_name)+' + map'+str(mapinds[g_m]+1)+'idx) = &') + code(' & opDat'+str(g_m+1)+'Staged(i3+1)') + ENDDO() + else: + DO('i3','0', dims[g_m]) + code('opDat'+str(g_m+1)+'Local(1 + i3 * direct_stride_OP2CONSTANT + i1) = &') + code(' & opDat'+str(g_m+1)+'Staged(i3+1)') + ENDDO() + if optflags[g_m]==1: + ENDIF() + depth = depth + 2 + + depth = depth - 2 + ENDDO() + code('!$omp end target teams distribute parallel do') + if not host_exec: + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and (not dims[g_m].isdigit() or int(dims[g_m])>1): + for d in range(0,int(dims[g_m])): + code('opDat'+str(g_m+1)+'Local('+str(d+1)+') = opDat'+str(g_m+1)+'Local_'+str(d+1)) + depth = depth - 2 + code('END SUBROUTINE') + 
+########################################################################## +# Generate OpenMP host stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + 
code('INTEGER(kind=4), SAVE :: calledTimes=0') + + if ninds > 0: #if indirect loop + code('INTEGER(kind=4) :: exec_size') + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: col_reord_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: nelem') + code('INTEGER(kind=4) :: offset_b') + else: + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + code(typs[g_m]+', ALLOCATABLE, DIMENSION(:) :: opDat'+str(g_m+1)+'LocalReduction') + + code('') + code('INTEGER(kind=4) :: i1,i2,n') + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.0_8, 0.00000_4,0.00000_4, 0)') + + #managing constants + if any_soa and not host_exec: + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + 
IF('(calledTimes.EQ.0).OR.(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT.NE.getSetSizeFromOpArg(opArg'+str(g_m+1)+'))') + code('opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT = getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + code('!$omp target update to(opDat'+str(invinds[inds[g_m]-1]+1)+'_stride_OP2CONSTANT)') + ENDIF() + if dir_soa!=-1: + IF('(calledTimes.EQ.0).OR.(direct_stride_OP2CONSTANT.NE.getSetSizeFromOpArg(opArg'+str(dir_soa+1)+'))') + code('direct_stride_OP2CONSTANT = getSetSizeFromOpArg(opArg'+str(dir_soa+1)+')') + code('!$omp target update to(direct_stride_OP2CONSTANT)') + ENDIF() + + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + if host_exec: + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + else: + code('n_upper = op_mpi_halo_exchanges_cuda(set%setCPtr,numberOfOpDats,opArgArray)') + code('') + + code('opSetCore => set%setPtr') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('exec_size = opSetCore%size + opSetCore%exec_size') + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('partitionSize=0') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray,4)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%col_reord,col_reord_'+name+',(/exec_size/))') + code('CALL c_f_pointer(actualPlan_'+name+'%color2_offsets,offset_'+name+',(/actualPlan_'+name+'%ncolors+1/))') + + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + code('opDat'+str(invinds[g_m]+1)+'MapDim 
= getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + + suffix = '' + if not host_exec: + suffix='_d' + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data'+suffix+',opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data'+suffix+',opDat'+str(invinds[g_m]+1)+'Map,(/exec_size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data'+suffix+',opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opArg'+str(g_m+1)+'%dim/))') + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('allocate(opDat'+str(g_m+1)+'LocalReduction(opArg'+str(g_m+1)+'%dim)) ') + code('opDat'+str(g_m+1)+'LocalReduction = opDat'+str(g_m+1)+'Local') + code('') + + for idx in needDimList: + dims[idx] = 'opArg'+str(idx+1)+'%dim' + + code('') + + if ninds > 0: #indirect loop host stub call + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. 
1') #actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('offset_b = offset_'+name+'(i1 + 1)') + code('nelem = offset_'+name+'(i1 + 1 + 1)') + + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + code('& getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+'), &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('& opDat'+str(g_m+1)+'LocalReduction(1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + if nmaps > 0: + k = [] + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (not mapnames[g_m] in k): + k = k + [mapnames[g_m]] + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'Map, &') + code('& opDat'+str(invinds[inds[g_m]-1]+1)+'MapDim, &') + code('& col_reord_'+name+', exec_size, offset_b, nelem, exec_size+opSetCore%nonexec_size )') + + if reduct and not host_exec: + IF('i1 .EQ. 
actualPlan_'+name+'%ncolors_owned -1') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'LocalReduction') + ENDIF() + + ENDDO() + else: + code('sliceStart = 0') + code('sliceEnd = opSetCore%size') + code('CALL op_wrap_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + if invinds[g_m] in needDimList: + code('& opArg'+str(invinds[g_m]+1)+'%dim, &') + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + code('& getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+'), &') + for g_m in range(0,nargs): + if g_m in needDimList: + code('& opArg'+str(g_m+1)+'%dim, &') + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL: + if accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('& opDat'+str(g_m+1)+'LocalReduction(1), &') + else: + code('& opDat'+str(g_m+1)+'Local, &') + code('& sliceStart, sliceEnd, opSetCore%size+opSetCore%exec_size+opSetCore%nonexec_size)') + + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. 
opSetCore%core_size)') + if host_exec: + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + else: + code('CALL op_mpi_wait_all_cuda(numberOfOpDats,opArgArray)') + ENDIF() + + if ninds==0: + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'LocalReduction') + code('') + if host_exec: + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + else: + code('CALL op_mpi_set_dirtybit_cuda(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] != OP_READ and accs[g_m] != OP_WRITE and not host_exec: + code('deallocate( opDat'+str(g_m+1)+'LocalReduction )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ or accs[g_m] == OP_WRITE: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer 
= dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000_4, 1)') + code('calledTimes = calledTimes + 1') + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_omp4kernel.F95','w') + elif bookleaf: + fid = open(prefixes[prefix_i]+name+'_omp4kernel.F90','w') + else: + fid = open(name+'_omp4kernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! auto-generated by op2.py\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/op2_gen_openmpINC.py b/translator/fortran/op2_gen_openmpINC.py new file mode 100644 index 000000000..14f1d49d6 --- /dev/null +++ b/translator/fortran/op2_gen_openmpINC.py @@ -0,0 +1,1060 @@ +########################################################################## +# +# OpenMP code generator +# +# This routine is called by op2 which parses the input files +# +# It produces a file xxx_kernel.F90 for each kernel, +# plus a master kernel file +# +########################################################################## + +import re +import datetime +import os + +def comm(line): + global file_text, FORTRAN, CPP + global depth + if len(line) == 0: + prefix = '' + else: + prefix = ' '*depth + if len(line) == 0: + file_text +='\n' + elif FORTRAN: + file_text +='! 
'+line+'\n' + elif CPP: + file_text +=prefix+'//'+line+'\n' +
+# rep: substitute the placeholder tokens in `line` for argument index m.
+# Reads the module globals dims, idxs, typs, indtyps, inddims and the
+# FORTRAN / CPP mode flags. The patterns are handed to re.sub without word
+# boundaries, so every occurrence of a token is replaced, wherever it
+# appears in the text. INDDIM/INDTYP/INDARG are handled before the shorter
+# DIM(S)/TYP/ARG tokens that they contain.
+# Returns the substituted line (unchanged when neither mode flag is set).
+def rep(line,m):
+  global dims, idxs, typs, indtyps, inddims
+
+  if FORTRAN:
+    # Fortran mode: INDDIM/INDTYP are only substituted when m indexes into
+    # inddims; argument names are numbered from 1 (m+1) and the dimension
+    # token is 'DIMS'.
+    if m < len(inddims):
+      line = re.sub('INDDIM',str(inddims[m]),line)
+      line = re.sub('INDTYP',str(indtyps[m]),line)
+
+    line = re.sub('INDARG','ind_arg'+str(m+1),line)
+    line = re.sub('DIMS',str(dims[m]),line)
+    line = re.sub('ARG','arg'+str(m+1),line)
+    line = re.sub('TYP',typs[m],line)
+    line = re.sub('IDX',str(int(idxs[m])),line)
+  elif CPP:
+    # C++ mode: no bounds check on inddims/indtyps, argument names are
+    # numbered from 0 (m) and the dimension token is 'DIM'.
+    line = re.sub('INDDIM',str(inddims[m]),line)
+    line = re.sub('INDTYP',str(indtyps[m]),line)
+
+    line = re.sub('INDARG','ind_arg'+str(m),line)
+    line = re.sub('DIM',str(dims[m]),line)
+    line = re.sub('ARG','arg'+str(m),line)
+    line = re.sub('TYP',typs[m],line)
+    line = re.sub('IDX',str(int(idxs[m])),line)
+  return line
+
+# code: append one line of generated source to the global file_text buffer.
+# An empty string appends a bare newline. Any other text is passed through
+# rep() with the current global g_m and prefixed with ' '*depth.
+# Nothing is appended when neither FORTRAN nor CPP is set.
+def code(text):
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  if len(text) == 0:
+    file_text += '\n'
+    return
+  # NOTE: the empty-text test below cannot be true any more because of the
+  # early return above, so prefix is always ' '*depth here.
+  if len(text) == 0:
+    prefix = ''
+  else:
+    prefix = ' '*depth
+  # Both modes currently append exactly the same text.
+  if FORTRAN:
+    file_text += prefix+rep(text,g_m)+'\n'
+  elif CPP:
+    file_text += prefix+rep(text,g_m)+'\n'
+
+# code_pre: like code(), but without the indentation prefix and without
+# the empty-string shortcut; the text still goes through rep().
+# Elsewhere in this file it is used to emit '#ifdef'-style lines.
+def code_pre(text):
+  global file_text, FORTRAN, CPP, g_m
+  if FORTRAN:
+    file_text += rep(text,g_m)+'\n'
+  elif CPP:
+    file_text += rep(text,g_m)+'\n'
+
+# DO: emit a loop header over [start, finish) and raise the indentation
+# depth by 2. i, start and finish are concatenated into the output, so
+# they must be strings. The Fortran form is 'DO i = start, finish-1, 1'
+# (explicit step of 1).
+def DO(i,start,finish):
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  if FORTRAN:
+    code('DO '+i+' = '+start+', '+finish+'-1, 1')
+  elif CPP:
+    code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
+  depth += 2
+
+# FOR: same as DO, except that the Fortran form omits the explicit step
+# ('DO i = start, finish-1'). The C++ output is identical to DO's.
+def FOR(i,start,finish):
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  if FORTRAN:
+    code('DO '+i+' = '+start+', '+finish+'-1')
+  elif CPP:
+    code('for ( int '+i+'='+start+'; '+i+'<'+finish+'; '+i+'++ ){')
+  depth += 2
+
+# ENDDO: lower the indentation depth by 2, then emit the loop terminator
+# ('END DO' in Fortran mode, '}' in C++ mode).
+def ENDDO():
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  depth -= 2
+  if FORTRAN:
+    code('END DO')
+  elif CPP:
+    code('}')
+
+# ENDFOR: identical in effect to ENDDO; closes a loop opened with FOR.
+def ENDFOR():
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  depth -= 2
+  if FORTRAN:
+    code('END DO')
+  elif CPP:
+    code('}')
+
+# IF: emit the opening of a conditional and raise the indentation depth
+# by 2. `line` is the condition text, placed verbatim between the
+# parentheses ('IF (...) THEN' in Fortran mode, 'if (...) {' in C++ mode).
+def IF(line):
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  if FORTRAN:
+    code('IF ('+line+') THEN')
+  elif CPP:
+    code('if ('+ line + ') {')
+  depth += 2
+
+# ELSE: emit the else-branch header one level out (depth is lowered by 2
+# for the keyword and raised again for the branch body).
+# NOTE: the C++ form emits only 'else {'; no closing '}' for the preceding
+# if-branch is written here.
+def ELSE():
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  depth -= 2
+  if FORTRAN:
+    code('ELSE')
+  elif CPP:
+    code('else {')
+  depth += 2
+
+# ENDIF: lower the indentation depth by 2, then emit the conditional
+# terminator ('END IF' in Fortran mode, '}' in C++ mode).
+def ENDIF():
+  global file_text, FORTRAN, CPP, g_m
+  global depth
+  depth -= 2
+  if FORTRAN:
+    code('END IF')
+  elif CPP:
+    code('}')
+
+
+def op2_gen_openmpINC(master, date, consts, kernels, hydra): + + global dims, idxs, typs, indtyps, inddims + global FORTRAN, CPP, g_m, file_text, depth + + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + accsstring = ['OP_READ','OP_WRITE','OP_RW','OP_INC','OP_MAX','OP_MIN' ] + + any_soa = 0 + for nk in range (0,len(kernels)): + any_soa = any_soa or sum(kernels[nk]['soaflags']) + +########################################################################## +# create new kernel file +########################################################################## + + for nk in range (0,len(kernels)): + name = kernels[nk]['name'] + nargs = kernels[nk]['nargs'] + dims = kernels[nk]['dims'] + maps = kernels[nk]['maps'] + var = kernels[nk]['var'] + typs = kernels[nk]['typs'] + accs = kernels[nk]['accs'] + idxs = kernels[nk]['idxs'] + inds = kernels[nk]['inds'] + soaflags = kernels[nk]['soaflags'] + optflags = kernels[nk]['optflags'] + ninds = kernels[nk]['ninds'] + inddims = kernels[nk]['inddims'] + indaccs = kernels[nk]['indaccs'] + indtyps = kernels[nk]['indtyps'] + invinds = kernels[nk]['invinds'] + + optidxs = [0]*nargs + indopts = [-1]*nargs + nopts = 0 + for i in range(0,nargs): + if optflags[i] == 1 and maps[i] == OP_ID: + optidxs[i] = nopts + nopts = nopts+1 + elif optflags[i] == 1 and maps[i] == OP_MAP: + if i == invinds[inds[i]-1]: #i.e. 
I am the first occurence of this dat+map combination + optidxs[i] = nopts + indopts[inds[i]-1] = i + nopts = nopts+1 + else: + optidxs[i] = optidxs[invinds[inds[i]-1]] + +# +# set two logicals +# + j = -1 + for i in range(0,nargs): + if maps[i] == OP_MAP and accs[i] == OP_INC: + j = i + ind_inc = j >= 0 + + j = -1 + for i in range(0,nargs): + if maps[i] == OP_GBL and accs[i] != OP_READ: + j = i + reduct = j >= 0 + + + FORTRAN = 1; + CPP = 0; + g_m = 0; + file_text = '' + depth = 0 + + ninds_staged = 0 + inds_staged = [-1]*nargs + for i in range(0,nargs): + if maps[i]==OP_MAP and accs[i]==OP_INC: + if inds_staged[invinds[inds[i]-1]] == -1: + inds_staged[i] = ninds_staged + ninds_staged = ninds_staged + 1 + else: + inds_staged[i] = inds_staged[invinds[inds[i]-1]] + invinds_staged = [-1]*ninds_staged + inddims_staged = [-1]*ninds_staged + indopts_staged = [-1]*ninds_staged + for i in range(0,nargs): + if inds_staged[i] >= 0 and invinds_staged[inds_staged[i]] == -1: + invinds_staged[inds_staged[i]] = i + inddims_staged[inds_staged[i]] = dims[i] + if optflags[i] == 1: + indopts_staged[inds_staged[i]] = i + for i in range(0,nargs): + inds_staged[i] = inds_staged[i] + 1 + +########################################################################## +# Generate Header +########################################################################## + if hydra: + code('MODULE '+kernels[nk]['mod_file'][4:]+'_MODULE') + else: + code('MODULE '+name.upper()+'_MODULE') + code('USE OP2_FORTRAN_DECLARATIONS') + code('USE OP2_FORTRAN_RT_SUPPORT') + code('USE ISO_C_BINDING') + if hydra == 0: + code('USE OP2_CONSTANTS') + + code('') + code('#ifdef _OPENMP'); depth = depth + 2 + code('USE OMP_LIB'); depth = depth - 2 + code('#endif') + +########################################################################## +# Variable declarations +########################################################################## + code('') + comm('variable declarations') + + code('') + + if ninds > 0: #if 
indirect loop + code('LOGICAL :: firstTime_'+name+' = .TRUE.') + code('type ( c_ptr ) :: planRet_'+name) + code('type ( op_plan ) , POINTER :: actualPlan_'+name) + code('type ( c_ptr ) , POINTER, DIMENSION(:) :: ind_maps_'+name) + code('type ( c_ptr ) , POINTER, DIMENSION(:) :: mappingArray_'+name) + code('') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_maps'+str(invinds_staged[g_m]+1)+'_'+name) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), POINTER, DIMENSION(:) :: mappingArray'+str(g_m+1)+'_'+name) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=4) :: mappingArray'+str(g_m+1)+'Size_'+name) + code('') + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: blkmap_'+name) + code('INTEGER(kind=4) :: blkmapSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_offs_'+name) + code('INTEGER(kind=4) :: ind_offsSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ind_sizes_'+name) + code('INTEGER(kind=4) :: ind_sizesSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nelems_'+name) + code('INTEGER(kind=4) :: nelemsSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: nthrcol_'+name) + code('INTEGER(kind=4) :: nthrcolSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: offset_'+name) + code('INTEGER(kind=4) :: offsetSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: thrcol_'+name) + code('INTEGER(kind=4) :: thrcolSize_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: ncolblk_'+name) + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: pnindirect_'+name) + +########################################################################## +# Inline user kernel function +########################################################################## + code('') + code('CONTAINS') + code('') + if hydra == 0: + comm('user function') + code('#include "'+name+'.inc"') + code('') + else: + modfile = 
kernels[nk]['mod_file'][4:] + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name + '.F95' + if not os.path.isfile(filename): + filename = modfile.split('_')[1].lower() + '/' + modfile.split('_')[0].lower() + '/' + name[:-1] + '.F95' + fid = open(filename, 'r') + text = fid.read() + fid.close() + text = text.replace('module','!module') + text = text.replace('contains','!contains') + text = text.replace('end !module','!end module') + file_text += text + #code(kernels[nk]['mod_file']) + code('') + +########################################################################## +# Generate OpenMP kernel function +########################################################################## + comm('x86 kernel function') + code('SUBROUTINE op_x86_'+name+'( &'); depth = depth + 2 + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+', &') + for g_m in range(0,ninds): + if inds_staged[invinds[g_m]] == 0: + code('& opMap'+str(invinds[g_m]+1)+', &') + code('& opMap'+str(invinds[g_m]+1)+'Dim, &') + + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+', &') + elif maps[g_m] == OP_GBL: + code('& opDat'+str(g_m+1)+', &') + + if ninds > 0: #indirect loop + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+', &') + code('& ind_sizes, &') + code('& ind_offs, &') + code('& blkmap, &') + code('& offset, &') + code('& nelems, &') + code('& nthrcol, &') + code('& thrcol, &') + code('& blockOffset, &') + code('& blockID )') + code('') + else: #direct loop + code('& sliceStart, &') + code('& sliceEnd )') + code('') + + + code('IMPLICIT NONE') + code('') + +########################################################################## +# Declare local variables +########################################################################## + comm('local 
variables') + if nopts>0: + code('INTEGER(kind=4) :: optflags') + if ninds > 0: #indirect loop + for g_m in range(0,ninds): + code(typs[invinds[g_m]]+', DIMENSION(0:*) :: opDat'+str(invinds[g_m]+1)) + for g_m in range(0,ninds): + if inds_staged[invinds[g_m]] == 0: + code('INTEGER(kind=4), DIMENSION(*) :: opMap'+str(invinds[g_m]+1)) + code('INTEGER(kind=4) :: opMap'+str(invinds[g_m]+1)+'Dim') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code(typs[g_m]+', DIMENSION(0:*) :: opDat'+str(g_m+1)) + elif maps[g_m] == OP_GBL: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)) + #code(typs[g_m]+', DIMENSION(1) :: opDat'+str(g_m+1)) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)) + + code('') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), DIMENSION(0:), target :: ind_maps'+str(invinds_staged[g_m]+1)) + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('INTEGER(kind=2), DIMENSION(0:*) :: mappingArray'+str(g_m+1)) + code('') + code('INTEGER(kind=4), DIMENSION(0:*) :: ind_sizes') + code('INTEGER(kind=4), DIMENSION(0:*) :: ind_offs') + code('INTEGER(kind=4), DIMENSION(0:*) :: blkmap') + code('INTEGER(kind=4), DIMENSION(0:*) :: offset') + code('INTEGER(kind=4), DIMENSION(0:*) :: nelems') + code('INTEGER(kind=4), DIMENSION(0:*) :: nthrcol') + code('INTEGER(kind=4), DIMENSION(0:*) :: thrcol') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: blockID') + code('INTEGER(kind=4) :: threadBlockOffset') + code('INTEGER(kind=4) :: threadBlockID') + code('INTEGER(kind=4) :: numberOfActiveThreads') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + add_real = 0 + add_int = 0 + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + if 'real' in typs[g_m].lower(): + add_real = 1 + elif 'integer' in typs[g_m].lower(): + add_int = 1 + if add_real: + code('REAL(kind=8), DIMENSION(0:128000 - 1), target :: sharedFloat8') + if add_int: + 
code('INTEGER(kind=4), DIMENSION(0:128000 - 1), target :: sharedInt8') + code('') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds_staged[g_m]+1)+'IndirectionMap') + code(typs[invinds_staged[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection') + +# for indirect OP_READ, we would pass in a pointer to shared, offset by map, but if opt, then map may not exist, thus we need a separate pointer + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'OptPtr') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4) :: opDat'+str(invinds_staged[g_m]+1)+'nBytes') + + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4) :: opDat'+str(invinds_staged[g_m]+1)+'RoundUp') + for g_m in range(0,ninds_staged): + code('INTEGER(kind=4) :: opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize') + + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP and (accs[g_m] == OP_INC): + code('REAL(kind=8), DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Map') + + + code('INTEGER(kind=4) :: numOfColours') + code('INTEGER(kind=4) :: numberOfActiveThreadsCeiling') + code('INTEGER(kind=4) :: colour1') + code('INTEGER(kind=4) :: colour2') + code('') + code('threadBlockID = blkmap(blockID + blockOffset)') + code('numberOfActiveThreads = nelems(threadBlockID)') + code('threadBlockOffset = offset(threadBlockID)') + code('numberOfActiveThreadsCeiling = numberOfActiveThreads') + code('numOfColours = nthrcol(threadBlockID)') + code('') + + for g_m in range(0,ninds_staged): + code('opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize = ind_sizes('+str(g_m)+' + threadBlockID * '+str(ninds_staged)+')') + + for g_m in range(0,ninds_staged): + 
code('opDat'+str(invinds_staged[g_m]+1)+'IndirectionMap => ind_maps'+str(invinds_staged[g_m]+1)+'(ind_offs('+str(g_m)+' + threadBlockID * '+str(ninds_staged)+'):)') + + for g_m in range(0,ninds_staged): + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize * ('+inddims_staged[g_m]+')') + code('opDat'+str(invinds_staged[g_m]+1)+'RoundUp = opDat'+str(invinds_staged[g_m]+1)+'RoundUp + MOD(opDat'+str(invinds_staged[g_m]+1)+'RoundUp,2)') + + for g_m in range(0,ninds_staged): + if g_m>0 and indopts_staged[g_m-1] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m-1]])+')') + if g_m == 0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + prev_size = 0 + if 'real' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 8 + elif 'integer' in typs[invinds_staged[g_m-1]].lower(): + prev_size = 4 + this_size = 0 + if 'real' in typs[invinds_staged[g_m]].lower(): + this_size = 8 + elif 'integer' in typs[invinds_staged[g_m]].lower(): + this_size = 4 + if this_size == 0 or prev_size == 0: + print("ERROR: Unrecognized type") + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)+' + opDat'+str(invinds_staged[g_m-1]+1)+'RoundUp * '+str(prev_size)+' / '+str(this_size)) + if g_m>0 and indopts_staged[g_m-1] > 0: + ELSE() + if g_m==0: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = 0') + else: + code('opDat'+str(invinds_staged[g_m]+1)+'nBytes = opDat'+str(invinds_staged[g_m-1]+1)+'nBytes * '+str(prev_size)+\ + ' / '+str(this_size)) + ENDIF() + + for g_m in range(0,ninds_staged): + if 'REAL' in typs[invinds_staged[g_m]].upper(): + code('opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection => sharedFloat8(opDat'+str(invinds_staged[g_m]+1)+'nBytes:)') + if 'INTEGER' in typs[invinds_staged[g_m]].upper(): + code('opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection => sharedInt8(opDat'+str(invinds_staged[g_m]+1)+'nBytes:)') + 
code('') + for g_m in range(0,ninds_staged): + if indopts_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m]])+')') + DO('i1','0','opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize') + DO('i2','0', inddims_staged[g_m]) + if accs[invinds_staged[g_m]] == OP_READ or accs[invinds_staged[g_m]] == OP_RW or accs[invinds_staged[g_m]] == OP_WRITE: + code('opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection(i2 + i1 * ('+inddims_staged[g_m]+\ + ') + 1) = opDat'+str(invinds_staged[g_m]+1)+'(i2 + opDat'+str(invinds_staged[g_m]+1)+\ + 'IndirectionMap(i1 + 1) * ('+inddims_staged[g_m]+'))') + elif accs[invinds_staged[g_m]] == OP_INC: + code('opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection(i2 + i1 * ('+inddims_staged[g_m]+\ + ') + 1) = 0') + ENDDO() + ENDDO() + if indopts_staged[g_m] > 0: + ENDIF() + code('') + + DO('i1','0','numberOfActiveThreadsCeiling') + code(' colour2 = -1') + IF('i1 < numberOfActiveThreads') + + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP and inds_staged[g_m] > 0: + DO('i2','0',dims[g_m]) + code('opDat'+str(g_m+1)+'Local(i2) = 0') + ENDDO() + + for g_m in range(0,nargs): + if (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and maps[g_m] == OP_MAP and optflags[g_m]==1: + if inds_staged[g_m] > 0: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1+mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+'):)') + else: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1+mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * 1:)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(0:)') + ENDIF() + else: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + code('opDat'+str(g_m+1)+'OptPtr => 
opDat'+str(invinds[inds[g_m]-1]+1)+'(opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * ('+dims[g_m]+'):)') + else: + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'(opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim):)') + ELSE() + code('opDat'+str(g_m+1)+'OptPtr => opDat'+str(invinds[inds[g_m]-1]+1)+'(0:)') + ENDIF() + + + else: #direct loop + for g_m in range(0,nargs): + if maps[g_m] != OP_GBL: + code(typs[g_m]+', DIMENSION(0:*) :: opDat'+str(g_m+1)) + else: #global arg + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code(typs[g_m]+' :: opDat'+str(g_m+1)) + #code(typs[g_m]+', DIMENSION(1) :: opDat'+str(g_m+1)) + else: + code(typs[g_m]+', DIMENSION(0:'+dims[g_m]+'-1) :: opDat'+str(g_m+1)) + + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('INTEGER(kind=4) :: i1') + + +########################################################################## +# x86 kernel call +########################################################################## + + if ninds > 0: #indirect kernel call + code('') + comm('kernel call') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line + indent + '& opDat'+str(g_m+1)+'((i1 + threadBlockOffset) * ('+dims[g_m]+'):(i1 + threadBlockOffset) * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + else: + line = line + indent + '& opDat'+str(g_m+1)+'((i1 + threadBlockOffset) * 1)' + if maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==0: + if inds_staged[g_m] > 0: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+'):1 + 
mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset) * 1)' + else: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'(opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * ('+dims[g_m]+'):opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + else: + line = line +indent + '& opDat'+str(invinds[inds[g_m]-1]+1)+'(opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * 1)' + elif maps[g_m] == OP_MAP and (accs[g_m] == OP_READ or accs[g_m] == OP_RW or accs[g_m] == OP_WRITE) and optflags[g_m]==1: + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1:'+dims[g_m]+')' + else: + line = line +indent + '& opDat'+str(g_m+1)+'OptPtr(1)' + elif maps[g_m] == OP_MAP and accs[g_m] == OP_INC: + if dims[g_m].isdigit() and int(dims[g_m])==1: + line = line +indent + '& opDat'+str(g_m+1)+'Local(0)' + else: + line = line +indent + '& opDat'+str(g_m+1)+'Local' + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1) + if g_m < nargs-1: + line = line +', &' + else: + line = line +' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + code('colour2 = thrcol(i1 + threadBlockOffset)') + ENDIF() + + code('') + for g_m in range(0,nargs): + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP and inds_staged[g_m] > 0: + if optflags[g_m]: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + code('opDat'+str(g_m+1)+'Map = mappingArray'+str(g_m+1)+'(i1 + threadBlockOffset)') + if optflags[g_m]: + ENDIF() + + code('') + 
DO('colour1','0','numOfColours') + IF('colour2 .EQ. colour1') + for g_m in range(0,nargs): + if optflags[g_m]==1 and maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + IF('BTEST(optflags,'+str(optidxs[g_m])+')') + if accs[g_m] == OP_INC and maps[g_m] == OP_MAP: + DO('i2','0',dims[g_m]) + if inds_staged[g_m] > 0: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + (i2 + opDat'+str(g_m+1)+'Map * ('+dims[g_m]+'))) = opDat'+str(invinds[inds[g_m]-1]+1)+'SharedIndirection(1 + (i2 + opDat'+str(g_m+1)+'Map * ('+dims[g_m]+'))) + opDat'+str(g_m+1)+'Local(i2)') + else: + code('opDat'+str(invinds[inds[g_m]-1]+1)+'(i2 + opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * ('+dims[g_m]+')) = opDat'+str(invinds[inds[g_m]-1]+1)+'(i2 + opMap'+str(invinds[inds[g_m]-1]+1)+'('+str(idxs[g_m])+'+(i1 + threadBlockOffset)*opMap'+str(invinds[inds[g_m]-1]+1)+'Dim) * ('+dims[g_m]+')) + opDat'+str(g_m+1)+'Local(i2)') + ENDDO() + if optflags[g_m]==1 and maps[g_m]==OP_MAP and (accs[g_m] == OP_INC): + ENDIF() + if maps[g_m]==OP_MAP and accs[g_m] == OP_INC: + code('') + ENDIF() + ENDDO() + ENDDO() + code('') + for g_m in range(0,ninds_staged): + if indopts_staged[g_m] > 0 and (accs[invinds_staged[g_m]]==OP_INC or accs[invinds_staged[g_m]]==OP_WRITE or accs[invinds_staged[g_m]]==OP_RW): + IF('BTEST(optflags,'+str(optidxs[indopts_staged[g_m]])+')') + if accs[invinds_staged[g_m]] == OP_INC: + DO('i1','0','opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize') + DO('i2','0',inddims_staged[g_m]) + code('opDat'+str(invinds_staged[g_m]+1)+'(i2 + opDat'+str(invinds_staged[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims_staged[g_m]+')) = opDat'+str(invinds_staged[g_m]+1)+'(i2 + opDat'+str(invinds_staged[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims_staged[g_m]+')) + opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection(1 + (i2 + i1 * ('+inddims_staged[g_m]+')))') + ENDDO() + ENDDO() + if accs[invinds_staged[g_m]] == OP_RW or 
accs[invinds_staged[g_m]] == OP_WRITE: + DO('i1','0','opDat'+str(invinds_staged[g_m]+1)+'SharedIndirectionSize') + DO('i2','0',inddims_staged[g_m]) + code('opDat'+str(invinds_staged[g_m]+1)+'(i2 + opDat'+str(invinds_staged[g_m]+1)+'IndirectionMap(i1 + 1) * ('+inddims_staged[g_m]+')) = opDat'+str(invinds_staged[g_m]+1)+'SharedIndirection(1 + (i2 + i1 * ('+inddims_staged[g_m]+')))') + ENDDO() + ENDDO() + if indopts[g_m] > 0 and (accs[invinds_staged[g_m]]==OP_INC or accs[invinds_staged[g_m]]==OP_WRITE or accs[invinds_staged[g_m]]==OP_RW): + ENDIF() + + else: #direct kernel call + code('') + comm('kernel call') + DO('i1','sliceStart', 'sliceEnd') + line = 'CALL '+name+'( &' + indent = '\n'+' '*depth + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL: + line = line + indent +'& opDat'+str(g_m+1) + else: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + line = line + indent +'& opDat'+str(g_m+1)+'(i1 * ('+dims[g_m]+'))' + else: + line = line + indent +'& opDat'+str(g_m+1)+'(i1 * ('+dims[g_m]+'):i1 * ('+dims[g_m]+') + '+dims[g_m]+' - 1)' + if g_m < nargs-1: + line = line + ', &' + else: + line = line + ' &' + depth = depth - 2 + code(line + indent + '& )') + depth = depth + 2 + ENDDO() + + depth = depth - 2 + code('END SUBROUTINE') + code('') + +########################################################################## +# Generate OpenMP hust stub +########################################################################## + code('SUBROUTINE '+name+'_host( userSubroutine, set, &'); depth = depth + 2 + for g_m in range(0,nargs): + if g_m == nargs-1: + code('& opArg'+str(g_m+1)+' )') + else: + code('& opArg'+str(g_m+1)+', &') + + code('') + code('IMPLICIT NONE') + code('character(kind=c_char,len=*), INTENT(IN) :: userSubroutine') + code('type ( op_set ) , INTENT(IN) :: set') + code('') + + for g_m in range(0,nargs): + code('type ( op_arg ) , INTENT(IN) :: opArg'+str(g_m+1)) + code('') + + code('type ( op_arg ) , DIMENSION('+str(nargs)+') :: opArgArray') + 
code('INTEGER(kind=4) :: numberOfOpDats') + code('INTEGER(kind=4) :: n_upper') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayStart') + code('INTEGER(kind=4), DIMENSION(1:8) :: timeArrayEnd') + code('REAL(kind=8) :: startTime') + code('REAL(kind=8) :: endTime') + code('INTEGER(kind=4) :: returnSetKernelTiming') + code('type ( op_set_core ) , POINTER :: opSetCore') + code('') + + for g_m in range(0,ninds): + code('type ( op_set_core ) , POINTER :: opSet'+str(invinds[g_m]+1)+'Core') + code(typs[invinds[g_m]]+', POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'Cardinality') + if inds_staged[invinds[g_m]] == 0: + code('INTEGER(kind=4), POINTER, DIMENSION(:) :: opDat'+str(invinds[g_m]+1)+'Map') + code('INTEGER(kind=4) :: opDat'+str(invinds[g_m]+1)+'MapDim') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('type ( op_set_core ) , POINTER :: opSet'+str(g_m+1)+'Core') + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + code('') + if maps[g_m] == OP_GBL: + if (dims[g_m].isdigit()) and (int(dims[g_m]) == 1): + code(typs[g_m]+', POINTER :: opDat'+str(g_m+1)+'Local') + else: + code(typs[g_m]+', POINTER, DIMENSION(:) :: opDat'+str(g_m+1)+'Local') + code('INTEGER(kind=4) :: opDat'+str(g_m+1)+'Cardinality') + + code('') + for g_m in range(0,nargs): + code('type ( op_dat_core ) , POINTER :: opDat'+str(g_m+1)+'Core') + code('') + + if ninds > 0: + for g_m in range(0,nargs): + code('type ( op_map_core ) , POINTER :: opMap'+str(g_m+1)+'Core') + code('') + + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingIndicesArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: mappingArray') + 
code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: accessDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: indirectionDescriptorArray') + code('INTEGER(kind=4), DIMENSION(1:'+str(nargs)+') :: opDatTypesArray') + code('INTEGER(kind=4) :: numberOfIndirectOpDats') + code('INTEGER(kind=4) :: blockOffset') + code('INTEGER(kind=4) :: nblocks') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i2') + code('') + + else: + code('INTEGER(kind=4) :: threadID') + code('INTEGER(kind=4) :: numberOfThreads') + code('INTEGER(kind=4) :: sliceStart') + code('INTEGER(kind=4) :: sliceEnd') + code('INTEGER(kind=4) :: partitionSize') + code('INTEGER(kind=4) :: i1') + code('INTEGER(kind=4) :: i10') + code('INTEGER(kind=4) :: i11') + code('REAL(kind=4) :: dataTransfer') + + code('') + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code(typs[g_m]+', DIMENSION(:), ALLOCATABLE :: reductionArrayHost'+str(g_m+1)) + + if nopts>0: + code('INTEGER(kind=4) :: optflags') + code('optflags = 0') + for i in range(0,nargs): + if optflags[i] == 1: + IF('opArg'+str(i+1)+'%opt == 1') + code('optflags = IBSET(optflags,'+str(optidxs[i])+')') + ENDIF() + if nopts > 30: + print('ERROR: too many optional arguments to store flags in an integer') + + code('') + code('numberOfOpDats = '+str(nargs)) + code('') + + for g_m in range(0,nargs): + code('opArgArray('+str(g_m+1)+') = opArg'+str(g_m+1)) + code('') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + code('& 0.d0, 0.00000,0.00000, 0)') + + code('call op_timers_core(startTime)') + code('') + #mpi halo exchange call + code('n_upper = op_mpi_halo_exchanges(set%setCPtr,numberOfOpDats,opArgArray)') + + depth = depth - 2 + if ninds > 0: + code_pre('#ifdef OP_PART_SIZE_1') + code_pre(' partitionSize = OP_PART_SIZE_1') + code_pre('#else') + code_pre(' partitionSize = 0') + code_pre('#endif') + + code('') + code_pre('#ifdef _OPENMP') + code_pre(' 
numberOfThreads = omp_get_max_threads()') + code_pre('#else') + code_pre(' numberOfThreads = 1') + code_pre('#endif') + depth = depth + 2 + + + if ninds > 0: + for g_m in range(0,nargs): + code('indirectionDescriptorArray('+str(g_m+1)+') = '+str(inds[g_m]-1)) + code('') + + code('numberOfIndirectOpDats = '+str(ninds)) + code('') + code('planRet_'+name+' = FortranPlanCaller( &') + code('& userSubroutine//C_NULL_CHAR, &') + code('& set%setCPtr, &') + code('& partitionSize, &') + code('& numberOfOpDats, &') + code('& opArgArray, &') + code('& numberOfIndirectOpDats, &') + code('& indirectionDescriptorArray, 1)') + code('') + code('CALL c_f_pointer(planRet_'+name+',actualPlan_'+name+')') + code('CALL c_f_pointer(actualPlan_'+name+'%nindirect,pnindirect_'+name+',(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_maps,ind_maps_'+name+',(/actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%maps,mappingArray_'+name+',(/numberOfOpDats/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ncolblk,ncolblk_'+name+',(/actualPlan_'+name+'%ncolors_core/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_sizes,ind_sizes_'+name+',(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%ind_offs,ind_offs_'+name+',(/actualPlan_'+name+'%nblocks * actualPlan_'+name+'%ninds_staged/))') + code('CALL c_f_pointer(actualPlan_'+name+'%blkmap,blkmap_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%offset,offset_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nelems,nelems_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%nthrcol,nthrcol_'+name+',(/actualPlan_'+name+'%nblocks/))') + code('CALL c_f_pointer(actualPlan_'+name+'%thrcol,thrcol_'+name+',(/set%setPtr%size/))') + code('') + for g_m in range(0,ninds_staged): + code('CALL 
c_f_pointer(ind_maps_'+name+'('+str(g_m+1)+'),ind_maps'+str(invinds_staged[g_m]+1)+'_'+name+',(/pnindirect_'+name+'('+str(g_m+1)+')/))') + code('') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('CALL c_f_pointer(mappingArray_'+name+'('+str(g_m+1)+'),mappingArray'+str(g_m+1)+'_'+name+',(/set%setPtr%size/))') + code('') + + + code('') + code('opSetCore => set%setPtr') + code('') + for g_m in range(0,ninds): + code('opDat'+str(invinds[g_m]+1)+'Cardinality = opArg'+str(invinds[g_m]+1)+'%dim * getSetSizeFromOpArg(opArg'+str(invinds[g_m]+1)+')') + if inds_staged[invinds[g_m]] == 0: + code('opDat'+str(invinds[g_m]+1)+'MapDim = getMapDimFromOpArg(opArg'+str(invinds[g_m]+1)+')') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + elif maps[g_m] == OP_GBL: + if (dims[g_m].isdigit() == 0) or (int(dims[g_m]) > 1): + code('opDat'+str(g_m+1)+'Cardinality = opArg'+str(g_m+1)+'%dim') + + code('') + for g_m in range(0,ninds): + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%data,opDat'+str(invinds[g_m]+1)+'Local,(/opDat'+str(invinds[g_m]+1)+'Cardinality/))') + if inds_staged[invinds[g_m]] == 0: + code('CALL c_f_pointer(opArg'+str(invinds[g_m]+1)+'%map_data,opDat'+str(invinds[g_m]+1)+'Map,(/opSetCore%size*opDat'+str(invinds[g_m]+1)+'MapDim/))') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local,(/opDat'+str(g_m+1)+'Cardinality/))') + elif maps[g_m] == OP_GBL: + if dims[g_m].isdigit() and int(dims[g_m]) == 1: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local)') + else: + code('CALL c_f_pointer(opArg'+str(g_m+1)+'%data,opDat'+str(g_m+1)+'Local, (/opDat'+str(g_m+1)+'Cardinality/))') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('allocate( 
reductionArrayHost'+str(g_m+1)+'(numberOfThreads * (('+dims[g_m]+'-1)/64+1)*64) )') + DO('i10','1','numberOfThreads+1') + DO('i11','1',dims[g_m]+'+1') + code('reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i11) = 0') + ENDDO() + ENDDO() + + if ninds > 0: #indirect loop host stub call + code('blockOffset = 0') + code('') + DO('i1','0','actualPlan_'+name+'%ncolors') + + IF('i1 .EQ. actualPlan_'+name+'%ncolors_core') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + + code('nblocks = ncolblk_'+name+'(i1 + 1)') + code('!$OMP PARALLEL DO private (threadID)') + DO('i2','0','nblocks') + code('threadID = omp_get_thread_num()') + code('CALL op_x86_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,ninds): + code('& opDat'+str(invinds[g_m]+1)+'Local, &') + for g_m in range(0,ninds): + if inds_staged[invinds[g_m]] == 0: + code('& opDat'+str(invinds[g_m]+1)+'Map, &') + code('& opDat'+str(invinds[g_m]+1)+'MapDim, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + elif maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + elif maps[g_m] == OP_GBL and (accs[g_m] == OP_READ or accs[g_m] == OP_WRITE): + code('& opDat'+str(g_m+1)+'Local, &') + + for g_m in range(0,ninds_staged): + code('& ind_maps'+str(invinds_staged[g_m]+1)+'_'+name+', &') + for g_m in range(0,nargs): + if inds_staged[g_m] > 0: + code('& mappingArray'+str(g_m+1)+'_'+name+', &') + + code('& ind_sizes_'+name+', &') + code('& ind_offs_'+name+', &') + code('& blkmap_'+name+', &') + code('& offset_'+name+', &') + code('& nelems_'+name+', &') + code('& nthrcol_'+name+', &') + code('& thrcol_'+name+', &') + code('& blockOffset,i2)') + + ENDDO() + code('!$OMP END PARALLEL DO') + code('blockOffset = blockOffset + nblocks') + ENDDO() + code('') + + + else: #direct loop host stub call + code('!$OMP PARALLEL DO 
private (sliceStart,sliceEnd,i1,threadID)') + DO('i1','0','numberOfThreads') + code('sliceStart = opSetCore%size * i1 / numberOfThreads') + code('sliceEnd = opSetCore%size * (i1 + 1) / numberOfThreads') + code('threadID = omp_get_thread_num()') + code('CALL op_x86_'+name+'( &') + if nopts>0: + code('& optflags, &') + for g_m in range(0,nargs): + if maps[g_m] == OP_ID: + code('& opDat'+str(g_m+1)+'Local, &') + if maps[g_m] == OP_GBL and accs[g_m] == OP_INC: + code('& reductionArrayHost'+str(g_m+1)+'(threadID * (('+dims[g_m]+'-1)/64+1)*64 + 1), &') + elif maps[g_m] == OP_GBL and (accs[g_m] == OP_READ or accs[g_m] == OP_WRITE): + code('& opDat'+str(g_m+1)+'Local, &') + code('& sliceStart, &') + code('& sliceEnd)') + ENDDO() + code('!$OMP END PARALLEL DO') + + code('') + IF('(n_upper .EQ. 0) .OR. (n_upper .EQ. opSetCore%core_size)') + code('CALL op_mpi_wait_all(numberOfOpDats,opArgArray)') + ENDIF() + code('') + code('') + code('CALL op_mpi_set_dirtybit(numberOfOpDats,opArgArray)') + code('') + + #reductions + for g_m in range(0,nargs): + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX): + DO('i10','1','numberOfThreads+1') + if (not dims[g_m].isdigit()) or int(dims[g_m]) > 1: + DO('i11','1',dims[g_m]+'+1') + code('opDat'+str(g_m+1)+'Local(i11) = opDat'+str(g_m+1)+'Local(i11) + reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + i11)') + ENDDO() + else: + code('opDat'+str(g_m+1)+'Local = opDat'+str(g_m+1)+'Local + reductionArrayHost'+str(g_m+1)+'((i10 - 1) * (('+dims[g_m]+'-1)/64+1)*64 + 1)') + ENDDO() + code('') + code('deallocate( reductionArrayHost'+str(g_m+1)+' )') + code('') + if maps[g_m] == OP_GBL and (accs[g_m] == OP_INC or accs[g_m] == OP_MIN or accs[g_m] == OP_MAX or accs[g_m] == OP_WRITE): + if typs[g_m] == 'real(8)' or typs[g_m] == 'REAL(kind=8)': + code('CALL op_mpi_reduce_double(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'real(4)' or typs[g_m] == 'REAL(kind=4)': + 
code('CALL op_mpi_reduce_float(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'integer(4)' or typs[g_m] == 'INTEGER(kind=4)': + code('CALL op_mpi_reduce_int(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + elif typs[g_m] == 'logical' or typs[g_m] == 'logical*1': + code('CALL op_mpi_reduce_bool(opArg'+str(g_m+1)+',opArg'+str(g_m+1)+'%data)') + code('') + + code('call op_timers_core(endTime)') + code('') + if ninds == 0: + code('dataTransfer = 0.0') + for g_m in range(0,nargs): + if accs[g_m] == OP_READ: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+')') + else: + if maps[g_m] == OP_GBL: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * 2.d0') + else: + code('dataTransfer = dataTransfer + opArg'+str(g_m+1)+'%size * getSetSizeFromOpArg(opArg'+str(g_m+1)+') * 2.d0') + + code('returnSetKernelTiming = setKernelTime('+str(nk)+' , userSubroutine//C_NULL_CHAR, &') + + if ninds > 0: + code('& endTime-startTime, actualPlan_'+name+'%transfer,actualPlan_'+name+'%transfer2, 1)') + else: + code('& endTime-startTime, dataTransfer, 0.00000, 1)') + + + depth = depth - 2 + code('END SUBROUTINE') + code('END MODULE') + code('') + +########################################################################## +# output individual kernel file +########################################################################## + if hydra: + name = 'kernels/'+kernels[nk]['master_file']+'/'+name + fid = open(name+'_kernel.F95','w') + else: + fid = open(name+'_kernel.F90','w') + date = datetime.datetime.now() + fid.write('!\n! 
auto-generated by op2.py on '+date.strftime("%Y-%m-%d %H:%M")+'\n!\n\n') + fid.write(file_text.strip()) + fid.close() diff --git a/translator/fortran/util.py b/translator/fortran/util.py new file mode 100644 index 000000000..e409e03a8 --- /dev/null +++ b/translator/fortran/util.py @@ -0,0 +1,581 @@ +#!/usr/bin/env python + +""" + OP2 source code transformation tool + + This tool parses the user's original source code to produce + target-specific code to execute the user's kernel functions. + + This prototype is written in Python and is directly based on the + parsing and code generation of the matlab source code transformation code + + usage: ./op2_fortran.py 'file1','file2',... + + This code generator is for parsing applications written using the OP2 FORTRAN API + + This takes as input + + file1.F90, file2.F90, ... + + and produces as output modified versions ..... + + file1_op.F90, file2_op.F90, ... + + then calls a number of target-specific code generators + to produce individual kernel files of the form + + xxx_kernel.F90 -- for OpenMP x86 execution + xxx_kernel.CUF -- for CUDA execution (based on PGI CUDA FORTRAN) + +""" + +import sys +import re +import os +def arg_parse(text,j): + + depth = 0 + loc2 = j; + while 1: + if text[loc2] == '(': + depth = depth + 1 + + elif text[loc2] == ')': + depth = depth - 1 + if depth == 0: + return loc2 + loc2 = loc2 + 1 + +def arg_parse2(text, j): + """Parsing arguments in op_par_loop to find the correct closing brace""" + + depth = 0 + loc2 = j + arglist = [] + prev_start = j + while 1: + if text[loc2] == '(': + if depth == 0: + prev_start = loc2+1 + depth = depth + 1 + + elif text[loc2] == ')': + depth = depth - 1 + if depth == 0: + arglist.append(text[prev_start:loc2].replace('&','').strip()) + return arglist + + elif text[loc2] == ',': + if depth == 1: + arglist.append(text[prev_start:loc2].replace('&','').strip()) + prev_start = loc2+1 + elif text[loc2] == '{': + depth = depth + 1 + elif text[loc2] == '}': + depth = 
depth - 1 + loc2 = loc2 + 1 + + +const_list = [] + +def replace_consts(text): + global const_list + i = text.find('use HYDRA_CONST_MODULE') + if i > -1: + fi2 = open("hydra_constants_list.txt","r") + for line in fi2: + fstr = '\\b'+line[:-1]+'\\b' + rstr = line[:-1]+'_OP2CONSTANT' + j = re.search(fstr,text,re.IGNORECASE) + if not (j is None) and not (line[:-1] in const_list): + const_list = const_list + [line[:-1]] + text = re.sub(fstr,rstr,text) + return text + +def replace_npdes(text): + # + # substitute npdes with DNPDE + # + i = re.search('\\bnpdes\\b',text) + if not (i is None): + j = i.start() + i = re.search('\\bnpdes\\b',text[j:]) + j = j + i.start()+5 + i = re.search('\\bnpdes\\b',text[j:]) + j = j + i.start()+5 + text = text[0:j] + re.sub('\\bnpdes\\b','NPDE',text[j:]) + return text + +funlist = [] +funlist2 = [] + +def get_stride_string(g_m,maps,stride,set_name): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + if maps[g_m] == OP_ID: + return 'direct_stride_OP2CONSTANT' + if maps[g_m] == OP_GBL: + return '(gridDim%x*blockDim%x)' + else: + idx = stride[g_m] + return 'opDat'+str(idx+1)+'_stride_OP2CONSTANT' + +def get_stride_string_mapnames(g_m,maps,mapnames,set_name): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + if maps[g_m] == OP_ID: + return 'direct_stride_OP2CONSTANT' + if maps[g_m] == OP_GBL: + return '(gridDim%x*blockDim%x)' + else: + idx = mapnames.index(mapnames[g_m]) + return 'opDat'+str(idx+1)+'_stride_OP2CONSTANT' + +def replace_soa_subroutines(funcs,idx,soaflags,maps,accs,mapnames,repl_inc,hydra,bookleaf,unknown_size_red,stride=[],atomics=0): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + name = funcs[idx]['function_name'] + if len(stride)==0: + stride = [0]*len(funcs[idx]['args']) + for g_m in range(0,len(funcs[idx]['args'])): + if g_m >= len(maps): + print(name, maps, funcs[idx]['args']) + if maps[g_m] == OP_MAP: + stride[g_m] = mapnames.index(mapnames[g_m]) + if funcs[idx]['soa_converted'] == 0: + funcs[idx]['soaflags'] = soaflags + if atomics or (1 in 
unknown_size_red): + funcs[idx]['function_text'] = replace_atomics(funcs[idx]['function_text'], + len(funcs[idx]['args']), + funcs[idx]['args'], name, maps, accs, '', mapnames, repl_inc, hydra, bookleaf,unknown_size_red, stride,atomics) + funcs[idx]['function_text'] = replace_soa(funcs[idx]['function_text'], + len(funcs[idx]['args']), + soaflags, name, maps, accs, '', mapnames, repl_inc, hydra, bookleaf,stride,atomics) + funcs[idx]['soa_converted'] = 1 + for idx_funcall in range(0,len(funcs[idx]['calls'])): + funcall = funcs[idx]['calls'][idx_funcall] + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + nargs = len(funcall['args']) + call_name = funcall['function_name'] + idx_called = -1 + for i in range(0,len(funcs)): + if funcs[i]['function_name'].lower()==call_name.lower(): + idx_called = i + if idx_called == -1: + print('ERROR, subroutine not found in replace_soa_subroutines: ' + call_name) + if len(funcs[idx_called]['args'])!=int(nargs): + print('ERROR: number of arguments of function '+call_name+' is '+str(len(funcs[idx_called]['args']))+' but trying to pass '+str(nargs)) + soaflags2 = [0]*nargs + stride2 = [0]*nargs + maps2 = [0]*nargs + accs2 = [0]*nargs + for i in range(0,nargs): + arg = funcall['args'][i] + #strip indexing + j = arg.find('(') + if j > 0: + arg = arg[0:j] + if arg in funcs[idx]['args']: + orig_idx = funcs[idx]['args'].index(arg) + soaflags2[i] = soaflags[orig_idx] + maps2[i] = maps[orig_idx] + accs2[i] = accs[orig_idx] + stride2[i] = stride[orig_idx] + funcs[idx]['calls'][idx_funcall]['soaflags']=soaflags2 + if funcs[idx_called]['soa_converted'] == 1 and soaflags2 != funcs[idx_called]['soaflags']: + print('WARNING: soaflags mismatch for repeated function call: ' + funcs[idx_called]['function_name']) + print(funcs[idx_called]['soaflags'], soaflags2) + if funcs[idx_called]['soa_converted'] == 1: + continue + funcs = replace_soa_subroutines(funcs,idx_called,soaflags2,maps2,accs2,mapnames,repl_inc,hydra,bookleaf,unknown_size_red,stride2,atomics) + return 
funcs + +def replace_atomics(text,nargs,varlist,name,maps,accs,set_name,mapnames,repl_inc,hydra,bookleaf,unknown_size_red,stride=[],atomics=0): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + need_istat = 0 + for g_m in range(0,nargs): + if (maps[g_m]==OP_MAP and accs[g_m]==OP_INC and atomics) or (maps[g_m]==OP_GBL and accs[g_m]!=OP_READ and unknown_size_red[g_m]==1): +# print(g_m) +# print(maps) +# print(accs) + p = re.compile('\\b'+varlist[g_m]+'\\b') + nmatches = len(p.findall(text)) + loc1 = 0; + idd = 0 + while idd < nmatches: + loc1 = loc1 + p.search(text[loc1:]).start() + if idd < 2: #variable declaration in function name, and its type + m = p.search(text[loc1+ len(varlist[g_m]):]) + if m: + loc1 = loc1 + len(varlist[g_m]) + m.start() + idd = idd + 1 + continue + else: + break + + #look for a = a + x expression and swap it for istat = atomicInc(a,x) + line,beg,end = get_full_line(text,loc1) + if len(line.strip())>4 and line.strip().lower()[0:5]=='call ': + end=beg #this is a function call, should mnot substitute with atomic add + loc1 = loc1 + len(varlist[g_m]) + else: + line = line.replace('&','') + line = line.replace('\n','') + j = line.find('=') + expr = re.escape(line[0:j].strip()) + if accs[g_m] == OP_INC: + m = re.search(expr+r'\s*=\s*'+expr+r'\s*(\+|\-)(.*)',line) + if m is None: + print('ERROR in '+name+' variable '+varlist[g_m]+' supposed to be OP_INC, but seems not to be: '+line) + text = text[:loc1] + 'istat = atomicAdd('+line[0:j].strip()+','+ m.group(1)+ m.group(2)+')' + text[end:] + elif accs[g_m] == OP_MIN: + m = re.search(expr+r'\s*=\s*min\('+expr+r'\s*,(.*)',line) + if m is None: + print('ERROR in '+name+' variable '+varlist[g_m]+' supposed to be OP_MIN, but seems not to be: '+line) + text = text[:loc1] + 'istat = atomicMin('+line[0:j].strip()+','+ m.group(1) + text[end:] + elif accs[g_m] == OP_MAX: + m = re.search(expr+r'\s*=\s*max\('+expr+r'\s*,(.*)',line) + if m is 
None: + print('ERROR in '+name+' variable '+varlist[g_m]+' supposed to be OP_MIN, but seems not to be: '+line) + text = text[:loc1] + 'istat = atomicMax('+line[0:j].strip()+','+ m.group(1) + text[end:] + + idd = idd + 1 + loc1 = end + need_istat = 1 + idd = idd + 1 + if need_istat: + k = re.search(r'implicit\s*none',text,re.IGNORECASE) + if k is not None: + k2 = k.start()+text[k.start():].find('\n') + else: + k2 = endj+text[endj:].find('\n') + text=text[0:k2+1]+' integer*4 istat\n'+text[k2+1:] + return text + + + + +def replace_soa(text,nargs,soaflags,name,maps,accs,set_name,mapnames,repl_inc,hydra,bookleaf,stride=[],atomics=0): + OP_ID = 1; OP_GBL = 2; OP_MAP = 3; + + OP_READ = 1; OP_WRITE = 2; OP_RW = 3; + OP_INC = 4; OP_MAX = 5; OP_MIN = 6; + + if len(stride)==0: + stride = [0]*nargs + for g_m in range(0,nargs): + if maps[g_m] == OP_MAP: + stride[g_m] = mapnames.index(mapnames[g_m]) + # + # Apply SoA to variable accesses + # + j = text.find(name) + endj = arg_parse(text,j) + while text[j] != '(': + j = j + 1 + arg_list = text[j+1:endj] + arg_list = arg_list.replace('&','') + varlist = ['']*nargs + leading_dim = [-1]*nargs + follow_dim = ['-1']*nargs + + for g_m in range(0,nargs): + varlist[g_m] = arg_list.split(',')[g_m].strip() + for g_m in range(0,nargs): + if soaflags[g_m] and (repl_inc or (not (maps[g_m]==OP_MAP and accs[g_m]==OP_INC))): + #Start looking for the variable in the code, after the function signature + loc1 = endj + p = re.compile('\\b'+varlist[g_m]+'\\b') + nmatches = len(p.findall(text[loc1:])) + idd = 0 + while idd < nmatches: + idd = idd + 1 +# for id in range(0,nmatches): + #Search for the next occurence + i = p.search(text[loc1:]) + #Skip commented out ones + j = text[:loc1+i.start()].rfind('\n') + if j > -1 and text[j:loc1+i.start()].find('!')>-1: + loc1 = loc1+i.end() + continue + + #check to see if label is followed by array subscript + idx = loc1 + i.end() + while text[idx].isspace(): + idx = idx+1 + + #normal subscript access or shape 
definition with varname(size) + if text[idx] == '(': + #opening bracket + beginarg = idx+1 + #Find closing bracket + endarg = arg_parse(text,loc1+i.start()) + #looking for shape definition, look backward for DIMENSION(size) + elif leading_dim[g_m] == -1: + j = text[:loc1+i.start()].rfind('\n') + j2 = loc1+i.start() + while text[j+1:j2].strip()[0] == '&': + j2 = j-1 + j = text[:j2].rfind('\n') + + k = text[j:j2].lower().find('dimension') + if k != -1: + beginarg = j+k+text[j+k:j2].find('(')+1 + endarg = arg_parse(text,beginarg-1) + else: + print('Could not find shape specification for '+varlist[g_m]+' in '+name+'- assuming scalar') + soaflags[g_m] = 0 + beginarg = loc1 + i.end() + endarg=beginarg + break + else: + #check if this is in a subroutine call + newl = text[:loc1+i.start()].rfind('\n') + while (len(text[newl+1:loc1+i.start()].strip())>0 and (text[newl+1:loc1+i.start()].strip()[0]=='&' or text[newl+1:loc1+i.start()].strip()[0]=='!')): + newl = text[:newl].rfind('\n') + res=re.search('\\bcall\\b',text[newl:loc1+i.start()],re.IGNORECASE) + if res is None and text[loc1+i.start()-1]!='%': + print('Warning: array subscript not found for ' + varlist[g_m] + ' in '+name) + loc1 = loc1+i.start() + len(varlist[g_m]) + continue + + + #If this is the first time we see the argument (i.e. 
its declaration) + if leading_dim[g_m] == -1: + if (len(text[beginarg:endarg].split(',')) > 1): + #if it's 2D, remember leading dimension, and make it 1D + leading_dim[g_m] = text[beginarg:endarg].split(',')[0] + if '0:' in text[beginarg:endarg].split(',')[1]: + follow_dim[g_m] = '' +# follow_dim[g_m] = text[beginarg:endarg].split(',')[1].split(':')[0] + text = text[:beginarg] + '*'+' '*(endarg-beginarg-1) + text[endarg:] + elif beginarg==endarg: + leading_dim[g_m] = 0 + else: + leading_dim[g_m] = 1 + #Continue search after this instance of the variable + loc1 = loc1+i.start()+len(varlist[g_m]) + else: + #If we have seen this variable already, then it's in the actual code, replace it with macro + macro = 'OP2_SOA('+text[loc1+i.start():loc1+i.end()]+',' + if leading_dim[g_m] == 1: + macro = macro + text[beginarg:endarg] + elif leading_dim[g_m] == 0: + if beginarg==endarg: + loc1 = loc1+i.start() + len(varlist[g_m]) + else: + print('Warning: '+varlist[g_m]+' in '+name+' was assumed scalar, but now accessed as array: '+text[beginarg:endarg]) + macro = macro + text[beginarg:endarg] + else: + macro = macro + text[beginarg:endarg].split(',')[0] + '+('+text[beginarg:endarg].split(',')[1]+follow_dim[g_m]+')*('+leading_dim[g_m]+')' + macro = macro + ', ' + get_stride_string(g_m,maps,stride,set_name) + ')' + text = text[:loc1+i.start()] + macro + text[endarg+1:] + #Continue search after this instance of the variable + loc1 = loc1+i.start() + len(macro) + + + return text + +def convert_F90(text): + text = re.sub(r'\nc','\n!',text) + text = re.sub(r'\n &','&\n &',text) + return text + +#i may point into the middle of the line... 
def comment_line(text, i):
    """Comment out the Fortran statement that contains offset ``i``.

    A ``'!'`` is inserted at the start of the physical line holding ``i``.
    Lines whose first non-blank character is ``'&'`` are treated as
    continuation lines: while the line just examined starts with ``'&'`` the
    preceding line is commented too, and afterwards every directly following
    line that starts with ``'&'`` is commented as well.

    Args:
        text: source text to edit.
        i: offset anywhere inside the line to comment out.

    Returns:
        A ``(new_text, shift)`` tuple. ``shift`` counts the ``'!'`` characters
        inserted for the current line and the lines before it, i.e. how far
        the original offset ``i`` has moved to the right.
    """
    def starts_with_amp(chunk):
        stripped = chunk.strip()
        return len(stripped) > 0 and stripped[0] == '&'

    def bang_after(src, newline_pos):
        # The '!' goes right behind the newline that opens the line.
        return src[0:newline_pos + 1] + '!' + src[newline_pos + 1:]

    inserted = 0
    start_nl = text[0:i].rfind('\n')
    stop_nl = i + text[i:].find('\n')
    chunk = text[start_nl:stop_nl]

    # Comment the line that contains i; everything behind it moves by one.
    text = bang_after(text, start_nl)
    stop_nl += 1
    inserted += 1

    # Walk backwards: a line starting with '&' continues the previous line.
    boundary = start_nl
    while starts_with_amp(chunk):
        prev_nl = text[0:boundary].rfind('\n')
        chunk = text[prev_nl:boundary]
        text = bang_after(text, prev_nl)
        stop_nl += 1
        inserted += 1
        boundary = prev_nl

    # Walk forwards over the continuation lines that follow.
    following_nl = stop_nl + 1 + text[stop_nl + 1:].find('\n')
    chunk = text[stop_nl:following_nl]
    while starts_with_amp(chunk):
        text = bang_after(text, stop_nl)
        # +1 because the '!' just inserted shifted the next newline.
        stop_nl = following_nl + 1
        following_nl = stop_nl + 1 + text[stop_nl + 1:].find('\n')
        chunk = text[stop_nl:following_nl]

    return text, inserted

#i may point into the middle of the line...
+def get_full_line(text, i): + orig_i = i + linebegin = text[0:i].rfind('\n') + lineend = i+text[i:].find('\n') + line = text[linebegin:lineend] + full_line = line + full_line_begin = linebegin + full_line_end = lineend + #comment this line, shift indices: + if len(line.strip())>0 and line.strip()[0]=='&': + #keep going backwards + b_lineend = linebegin + b_linebegin = text[0:b_lineend].rfind('\n') + line = text[b_linebegin:b_lineend] + full_line = line + full_line + full_line_begin = b_linebegin + while len(line.strip())>0 and line.strip()[0]=='&': + b_lineend = b_linebegin + b_linebegin = text[0:b_lineend].rfind('\n') + line = text[b_linebegin:b_lineend] + full_line = line + full_line + full_line_begin = b_linebegin + nextline_end = lineend+1+text[lineend+1:].find('\n') + line = text[lineend:nextline_end] + while len(line.strip())>0 and line.strip()[0]=='&': + full_line = full_line+line + full_line_end = nextline_end + text = text[0:lineend+1]+'!'+text[lineend+1:] + lineend = nextline_end + nextline_end = lineend+1+text[lineend+1:].find('\n') + line = text[lineend:nextline_end] + return full_line, full_line_begin, full_line_end + +def remove_jm76(text): + jm76_funs = ['SET_RGAS_RATIOS', 'INITGAS0', 'INITGAS1', 'INITGAS2', 'INITFUEL', 'INITVAP0', 'TOTALTP', 'TOTALP', 'TOTALT', 'STATICTP', 'QSTATICTP', 'USTATICTP', 'FLOWSPEED', 'DREALGA', 'SPECS', 'REALPHI', 'REALH', 'REALCP', 'REALRG', 'REALT', 'ISENT', 'ISENP'] + for fun in jm76_funs: + k = re.search(r'\n\s+.*\b'+fun+r'\b',text,re.IGNORECASE) + while not (k is None): + text,comm_inserted = comment_line(text,k.start()+1) + k = re.search(r'\n\s+.*\b'+fun+r'\b',text,re.IGNORECASE) + return text + +def get_kernel(text, name): + i = re.search(r'\n\s*\bsubroutine\b\s*'+name+r'\b', text, re.IGNORECASE) + if i: + #attempt 1: find end subroutine + j = re.search(r'\n\s*\bend\s+subroutine\b'+name+r'\b', text[i.start():], re.IGNORECASE) + if j: + return text[i.start():i.start()+j.end()] + #attempt 2: find next subroutine + j 
= re.search(r'\n\s*\bsubroutine\b', text[i.end():], re.IGNORECASE) + if j: + last_end = i.start()+[m.end() for m in re.finditer(r'\n\s*\bend\b',text[i.start():i.end()+j.start()], re.IGNORECASE)][-1] + return text[i.start():last_end+text[last_end:].find('\n')] + #attempt 3: end of file + last_end = i.start()+[m.end() for m in re.finditer(r'\n\s*\bend\b',text[i.start():], re.IGNORECASE)][-1] + return text[i.start():last_end+text[last_end:].find('\n')] + else: + return '' + + +def find_function_calls(text, attr, name=''): + global funlist + global funlist2 + text = remove_jm76(text) + j = re.search(r'\n\s*call hyd_',text,re.IGNORECASE) + while not (j is None): + text,comm_inserted = comment_line(text,j.start()+1) + j = re.search(r'\n\s*call hyd_',text,re.IGNORECASE) + j = re.search(r'\n\s*external',text,re.IGNORECASE) + while not (j is None): + text,comm_inserted = comment_line(text,j.start()+1) + j = re.search(r'\n\s*external',text,re.IGNORECASE) + j = re.search(r'\n\s*call op_',text,re.IGNORECASE) + while not (j is None): + text,comm_inserted = comment_line(text,j.start()+1) + j = re.search(r'\n\s*call op_',text,re.IGNORECASE) + j = re.search(r'\n\s*write\b',text,re.IGNORECASE) + while not (j is None): + text,comm_inserted = comment_line(text,j.start()+1) + j = re.search(r'\n\s*write\b',text,re.IGNORECASE) + + search_offset = 0 + my_subs = '' + children_subs='' + funlist_index = len(funlist2) + if name == '': + i = text.find('subroutine') + openbracket = i+text[i:].find('(') + name = text[i+len('subroutine'):openbracket].strip() + else: + i = text.find(name) + openbracket = i+text[i:].find('(') + funlist_entry = {'function_name' : name, + 'function_text' : text, + 'args' : arg_parse2(text,openbracket-1), + 'soa_converted' : 0, + 'calls': []} + funlist2 = funlist2 + [funlist_entry] + res=re.search('\\bcall\\b',text,re.IGNORECASE) + while (not (res is None)): + i = search_offset + res.start() + 4 + #Check if line is commented + j = text[:i].rfind('\n') + if j > -1 and 
text[j:i].find('!')>-1: + search_offset = i + res=re.search('\\bcall\\b',text[search_offset:],re.IGNORECASE) + continue + #find name: whatever is in front of opening bracket + openbracket = i+text[i:].find('(') + fun_name = text[i:openbracket].strip() + if 'hyd_' in fun_name: + print(text[j:openbracket]) + if fun_name.lower() in funlist: + funcall_entry = {'function_name': fun_name+'_gpu', + 'args' : arg_parse2(text,openbracket-1)} + funlist2[funlist_index]['calls'].append(funcall_entry) + search_offset = i + res=re.search('\\bcall\\b',text[search_offset:],re.IGNORECASE) + continue + + #print fun_name + + funlist = funlist + [fun_name.lower()] + funcall_entry = {'function_name': fun_name+'_gpu', + 'args' : arg_parse2(text,openbracket-1)} + funlist2[funlist_index]['calls'].append(funcall_entry) + #find signature + line = text[openbracket:openbracket+text[openbracket:].find('\n')].strip() + curr_pos = openbracket+text[openbracket:].find('\n')+1 + while (line[len(line)-1] == '&'): + line = text[curr_pos:curr_pos+text[curr_pos:].find('\n')].strip() + curr_pos = curr_pos+text[curr_pos:].find('\n')+1 + curr_pos = curr_pos-1 + arglist = text[openbracket:curr_pos] + #find the file containing the implementation + subr_file = os.popen('grep -Rilw --include "*.F90" --include "*.F" --exclude "*kernel.*" "subroutine '+fun_name+'\\b" . 
| head -n1').read().strip() + if (len(subr_file) == 0) or (not os.path.exists(subr_file)): + print('Error, subroutine '+fun_name+' implementation not found in files, check parser!') + exit(1) + #read the file and find the implementation + subr_fileh = open(subr_file,'r') + subr_fileh_text = subr_fileh.read() + if subr_file[len(subr_file)-1]=='F': + subr_fileh_text = convert_F90(subr_fileh_text) + subr_text = get_kernel(subr_fileh_text,fun_name) + #get rid of comments and realgas calls + subr_text = re.sub('\n*!.*\n','\n',subr_text) + subr_text = re.sub('!.*\n','\n',subr_text) + + if attr != '': + subr_text = replace_consts(subr_text) + subr_text = re.sub(r'(\n\s*)\bsubroutine\b\s+'+fun_name+r'\b', r'\1'+attr+' subroutine '+fun_name+'_gpu',subr_text,flags=re.IGNORECASE) + my_subs = my_subs + '\n' + subr_text + text1, text2 = find_function_calls(subr_text, attr, fun_name+'_gpu') + children_subs = children_subs + '\n' + text1 + search_offset = i + res=re.search('\\bcall\\b',text[search_offset:],re.IGNORECASE) + funlist2[funlist_index]['function_text']=text + return my_subs+children_subs, text