diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 45df5ed57..ab79c43fd 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -30,7 +30,7 @@ jobs: - name: Set correct Python version uses: actions/setup-python@v2 with: - python-version: '3.6' + python-version: '3.8' - name: Clone PETSc uses: actions/checkout@v2 diff --git a/pyop2/__init__.py b/pyop2/__init__.py index f0deef2e1..864b55af9 100644 --- a/pyop2/__init__.py +++ b/pyop2/__init__.py @@ -7,3 +7,8 @@ from pyop2._version import get_versions __version__ = get_versions()['version'] del get_versions + +from pyop2.configuration import configuration +from pyop2.compilation import max_simd_width +if configuration["vectorization_strategy"]: + configuration["simd_width"] = max_simd_width() diff --git a/pyop2/codegen/c/inverse.c b/pyop2/codegen/c/inverse.c index 7f445d385..be8bafd6c 100644 --- a/pyop2/codegen/c/inverse.c +++ b/pyop2/codegen/c/inverse.c @@ -6,8 +6,10 @@ #define BUF_SIZE 30 static PetscBLASInt ipiv_buffer[BUF_SIZE]; static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE]; +static PetscScalar Aout_proxy_buffer[BUF_SIZE*BUF_SIZE]; #endif + #ifndef PYOP2_INV_LOG_EVENTS #define PYOP2_INV_LOG_EVENTS PetscLogEvent ID_inv_memcpy = -1; @@ -16,32 +18,58 @@ PetscLogEvent ID_inv_getri = -1; static PetscBool log_active_inv = 0; #endif -void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N) +static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N, + PetscBLASInt incA, PetscBLASInt incAout) { PetscLogIsActive(&log_active_inv); - if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);} PetscBLASInt info; PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv)); PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork)); - memcpy(Aout, A, N*N*sizeof(PetscScalar)); + + PetscInt N_sq = N * N; + PetscInt one = 1; + + // Aout_proxy: 'Aout', but stored contiguously + PetscScalar *Aout_proxy; + if (incAout == 1) + Aout_proxy = Aout; + else + { + // TODO: Must see if allocating has a significant performance impact + Aout_proxy = N_sq <= BUF_SIZE ? Aout_proxy_buffer : malloc(N*N*sizeof(*Aout)); + } + + if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);} + BLAScopy_(&N_sq, A, &incA, Aout_proxy, &one); if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);} if (log_active_inv){PetscLogEventBegin(ID_inv_getrf,0,0,0,0);} - LAPACKgetrf_(&N, &N, Aout, &N, ipiv, &info); + LAPACKgetrf_(&N, &N, Aout_proxy, &N, ipiv, &info); if (log_active_inv){PetscLogEventEnd(ID_inv_getrf,0,0,0,0);} if(info == 0){ if (log_active_inv){PetscLogEventBegin(ID_inv_getri,0,0,0,0);} - LAPACKgetri_(&N, Aout, &N, ipiv, Awork, &N, &info); + LAPACKgetri_(&N, Aout_proxy, &N, ipiv, Awork, &N, &info); if (log_active_inv){PetscLogEventEnd(ID_inv_getri,0,0,0,0);} + + // Copy Aout_proxy back to Aout + if (Aout != Aout_proxy) + { + if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);} + BLAScopy_(&N_sq, Aout_proxy, &one, Aout, &incAout); + if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);} + } } if(info != 0){ fprintf(stderr, "Getri throws nonzero info."); abort(); } - if ( N > BUF_SIZE ) { + + if (Awork != work_buffer) free(Awork); + if (ipiv != ipiv_buffer) free(ipiv); - } + if ((Aout_proxy != Aout) && (Aout_proxy != Aout_proxy_buffer)) + free(Aout_proxy); } diff --git a/pyop2/codegen/c/solve.c b/pyop2/codegen/c/solve.c index fbabc9588..c1444f1d1 100644 --- a/pyop2/codegen/c/solve.c +++ b/pyop2/codegen/c/solve.c @@ -8,6 +8,8 @@ static PetscBLASInt ipiv_buffer[BUF_SIZE]; static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE]; #endif +static PetscScalar out_proxy_buffer[BUF_SIZE]; + #ifndef PYOP2_SOLVE_LOG_EVENTS #define PYOP2_SOLVE_LOG_EVENTS PetscLogEvent ID_solve_memcpy = -1; @@ -16,15 +18,32 @@ PetscLogEvent ID_solve_getrs = -1; static PetscBool log_active_solve = 0; #endif -void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N) + +/* + * @param[incA]: Stride value while accessing elements of 'A'. + * @param[incB]: Stride value while accessing elements of 'B'. + * @param[incOut]: Stride value while accessing elements of 'out'. + */ +void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N, + PetscBLASInt incA, PetscBLASInt incB, PetscBLASInt incOut) { + PetscScalar* out_proxy; /// output laid-out with unit stride, expected by LAPACK + PetscInt N_sq = N*N; + PetscInt one = 1; PetscLogIsActive(&log_active_solve); if (log_active_solve){PetscLogEventBegin(ID_solve_memcpy,0,0,0,0);} PetscBLASInt info; PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv)); - memcpy(out,B,N*sizeof(PetscScalar)); - PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork)); - memcpy(Awork,A,N*N*sizeof(PetscScalar)); + + if (incOut == 1) + out_proxy = out; + else + out_proxy = (N <= BUF_SIZE) ? out_proxy_buffer : malloc(N*sizeof(*out)); + + BLAScopy_(&N, B, &incB, out_proxy, &one); + + PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N_sq*sizeof(*Awork)); + BLAScopy_(&N_sq, A, &incA, Awork, &one); if (log_active_solve){PetscLogEventEnd(ID_solve_memcpy,0,0,0,0);} PetscBLASInt NRHS = 1; @@ -35,7 +54,11 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con if(info == 0){ if (log_active_solve){PetscLogEventBegin(ID_solve_getrs,0,0,0,0);} - LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out, &N, &info); + LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out_proxy, &N, &info); + + if (out != out_proxy) + BLAScopy_(&N, out_proxy, &one, out, &incOut); + if (log_active_solve){PetscLogEventEnd(ID_solve_getrs,0,0,0,0);} } @@ -44,8 +67,12 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con abort(); } - if ( N > BUF_SIZE ) { - free(ipiv); - free(Awork); - } + if (ipiv != ipiv_buffer) + free(ipiv); + + if (Awork != work_buffer) + free(Awork); + + if ((out_proxy != out) && (out_proxy != out_proxy_buffer)) + free(out_proxy); } diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index c083dd605..dfbd3a656 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -143,6 +143,7 @@ def with_types(self, arg_id_to_dtype, callables_table): callables_table) def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError assert self.is_ready_for_codegen() assert isinstance(insn, loopy.CallInstruction) @@ -151,6 +152,9 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper): parameters = list(parameters) par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)] + if expression_to_code_mapper.codegen_state.vectorization_info: + raise UnvectorizableError("LACallable: cannot take in vector arrays") + parameters.append(insn.assignees[-1]) par_dtypes.append(self.arg_id_to_dtype[0]) @@ -177,6 +181,46 @@ class INVCallable(LACallable): """ name = "inverse" + def with_descrs(self, arg_id_to_descr, callables_table): + a_descr = arg_id_to_descr.get(0) + a_inv_descr = arg_id_to_descr.get(-1) + + if a_descr is None or a_inv_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert len(a_descr.shape) == 2 + assert a_descr.shape == a_inv_descr.shape + assert a_descr.shape[1] == a_descr.shape[0] + + return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError + + # Override codegen to emit stride info. to the blas calls. + in_descr = self.arg_id_to_descr[0] + out_descr = self.arg_id_to_descr[-1] + ecm = expression_to_code_mapper + + # see pyop2/codegen/c/inverse.c for the func. signature + inc_a = in_descr.dim_tags[1].stride + inc_a_out = out_descr.dim_tags[1].stride + n = in_descr.shape[0] + + a, = insn.expression.parameters + a_out, = insn.assignees + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize 'inverse'.") + + c_parameters = [ecm(a_out).expr, + ecm(a).expr, + n, + inc_a, + inc_a_out] + return var(self.name_in_target)(*c_parameters), False + def generate_preambles(self, target): assert isinstance(target, type(target)) yield ("inverse", inverse_preamble) @@ -189,19 +233,65 @@ class SolveCallable(LACallable): """ name = "solve" + def with_descrs(self, arg_id_to_descr, callables_table): + a_descr = arg_id_to_descr.get(0) + b_descr = arg_id_to_descr.get(1) + x_descr = arg_id_to_descr.get(-1) + + if a_descr is None or b_descr is None: + # shapes aren't specialized enough to be resolved + return self, callables_table + + assert len(a_descr.shape) == 2 + assert len(x_descr.shape) == 1 + assert b_descr.shape == x_descr.shape + + return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + from loopy.codegen import UnvectorizableError + + # Override codegen to emit stride info. to the blas calls. + a_descr = self.arg_id_to_descr[0] + b_descr = self.arg_id_to_descr[1] + out_descr = self.arg_id_to_descr[-1] + ecm = expression_to_code_mapper + + # see pyop2/codegen/c/solve.c for the func. signature + inc_a = a_descr.dim_tags[1].stride + inc_b = b_descr.dim_tags[0].stride + inc_out = out_descr.dim_tags[0].stride + n = a_descr.shape[0] + + a, b = insn.expression.parameters + out, = insn.assignees + + if ecm.codegen_state.vectorization_info is not None: + raise UnvectorizableError("cannot vectorize 'inverse'.") + + c_parameters = [ecm(out).expr, + ecm(a).expr, + ecm(b).expr, + n, + inc_a, + inc_b, + inc_out] + return var(self.name_in_target)(*c_parameters), False + def generate_preambles(self, target): assert isinstance(target, type(target)) yield ("solve", solve_preamble) class _PreambleGen(ImmutableRecord): - fields = set(("preamble", )) + fields = {"preamble", "idx"} - def __init__(self, preamble): + def __init__(self, preamble, idx="0"): self.preamble = preamble + self.idx = idx def __call__(self, preamble_info): - yield ("0", self.preamble) + yield (self.idx, self.preamble) class PyOP2KernelCallable(loopy.ScalarCallable): @@ -537,7 +627,9 @@ def renamer(expr): options=options, assumptions=assumptions, lang_version=(2018, 2), - name=wrapper_name) + name=wrapper_name, + # TODO, should these really be silenced? + silenced_warnings=["write_race*", "data_dep*"]) # prioritize loops for indices in context.index_ordering: diff --git a/pyop2/compilation.py b/pyop2/compilation.py index ecca43187..dfc2c2662 100644 --- a/pyop2/compilation.py +++ b/pyop2/compilation.py @@ -46,6 +46,7 @@ from pyop2.mpi import MPI, collective, COMM_WORLD from pyop2.mpi import dup_comm, get_compilation_comm, set_compilation_comm +from pyop2.caching import cached from pyop2.configuration import configuration from pyop2.logger import warning, debug, progress, INFO from pyop2.exceptions import CompilationError @@ -458,14 +459,14 @@ class MacClangCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall", "-framework", "Accelerate") _ldflags = ("-dynamiclib",) - _optflags = ("-O3", "-ffast-math", "-march=native") + _optflags = ("-O3", "-ffast-math", "-march=native", "-fopenmp-simd") _debugflags = ("-O0", "-g") class MacClangARMCompiler(MacClangCompiler): """A compiler for building a shared library on ARM based Mac systems.""" # See https://stackoverflow.com/q/65966969 - _optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14") + _optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14", "-fopenmp-simd") # Need to pass -L/opt/homebrew/opt/gcc/lib/gcc/11 to prevent linker error: # ld: file not found: @rpath/libgcc_s.1.1.dylib for architecture arm64 This # seems to be a homebrew configuration issue somewhere. Hopefully this @@ -486,7 +487,7 @@ class LinuxGnuCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall") _ldflags = ("-shared",) - _optflags = ("-march=native", "-O3", "-ffast-math") + _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp") _debugflags = ("-O0", "-g") def sniff_compiler_version(self, cpp=False): @@ -543,7 +544,7 @@ class LinuxClangCompiler(Compiler): _cxxflags = ("-fPIC", "-Wall") _ldflags = ("-shared", "-L/usr/lib") - _optflags = ("-march=native", "-O3", "-ffast-math") + _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp-simd") _debugflags = ("-O0", "-g") @@ -558,7 +559,7 @@ class LinuxIntelCompiler(Compiler): _cxxflags = ("-fPIC", "-no-multibyte-chars") _ldflags = ("-shared",) - _optflags = ("-Ofast", "-xHost") + _optflags = ("-Ofast", "-xHost", "-qopenmp-simd") _debugflags = ("-O0", "-g") @@ -696,3 +697,23 @@ def clear_cache(prompt=False): shutil.rmtree(cachedir) else: print("Not removing cached libraries") + + +@cached(cache={}) +def max_simd_width(): + prg_str = '''#include + +int get_simd_width(){ + return __builtin_cpu_supports("avx512f") ? 8: + __builtin_cpu_supports("avx") ? 4: + __builtin_cpu_supports("sse") ? 2: + 1; +} +''' + try: + simd_width = load(prg_str, "c", "get_simd_width", restype=ctypes.c_int) + width = simd_width() + except (OSError, CompilationError): + warning("Cannot sniff SIMD width, using default of 4 doubles") + width = 4 + return width diff --git a/pyop2/configuration.py b/pyop2/configuration.py index 29717718c..b6c8dc9cd 100644 --- a/pyop2/configuration.py +++ b/pyop2/configuration.py @@ -74,6 +74,13 @@ class Configuration(dict): cdim > 1 be built as block sparsities, or dof sparsities. The former saves memory but changes which preconditioners are available for the resulting matrices. (Default yes) + :param vectorization_strategy: A :class:`str` describing the + vectorization strategy that must to be applied to the kernels. Can + be one of the following -- + - ``cross-element``: Cross-element vectorization strategy of + ``__. + :param alignment: A :class:`int` which specifies a size to which all temporaries + are aligned in memory. """ # name, env variable, type, default, write once cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid()) @@ -92,6 +99,12 @@ class Configuration(dict): ("PYOP2_LDFLAGS", str, ""), "simd_width": ("PYOP2_SIMD_WIDTH", int, 4), + "extra_info": + ("PYOP2_EXTRA_INFO", bool, False), + "vectorization_strategy": + ("PYOP2_VECT_STRATEGY", str, "cross-element"), + "alignment": + ("PYOP2_ALIGNMENT", int, 64), "debug": ("PYOP2_DEBUG", bool, False), "compute_kernel_flops": diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py index 6ef49dfa6..776ddd8f4 100644 --- a/pyop2/global_kernel.py +++ b/pyop2/global_kernel.py @@ -15,6 +15,7 @@ from pyop2.datatypes import IntType, as_ctypes from pyop2.types import IterationRegion from pyop2.utils import cached_property, get_petsc_dir +from pyop2 import op2 # We set eq=False to force identity-based hashing. This is required for when @@ -211,7 +212,7 @@ def __len__(self): @property def cache_key(self): - return tuple(a.cache_key for a in self.arguments) + return tuple(a.cache_key for a in self.arguments) + tuple(configuration["vectorization_strategy"]) @property def maps(self): @@ -252,7 +253,8 @@ class GlobalKernel(Cached): @classmethod def _cache_key(cls, local_knl, arguments, **kwargs): key = [cls, local_knl.cache_key, - *kwargs.items(), configuration["simd_width"]] + *kwargs.items(), configuration["simd_width"], + configuration["vectorization_strategy"]] key.extend([a.cache_key for a in arguments]) @@ -345,8 +347,62 @@ def builder(self): def code_to_compile(self): """Return the C/C++ source code as a string.""" from pyop2.codegen.rep2loopy import generate + from loopy.symbolic import get_dependencies + from functools import reduce wrapper = generate(self.builder) + if self._extruded: + iname = "layer" + else: + iname = "n" + + # TODO: vectorizing 2-form assembly kernels is possible, but must + # change the arguments passed to MatSetValuesxxx (not yet implemented) + has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg)) + for arg in self.arguments) + has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments) + is_cplx = (any(lp.types.NumpyType(dtype).is_complex() + if not isinstance(dtype, lp.types.NumpyType) + else dtype.is_complex() + for dtype in self.local_kernel.dtypes + ) # local args complex? + or any(arg.dtype.is_complex() + for arg in tuple(wrapper.default_entrypoint.temporary_variables.values()))) # global temps complex? + extruded_coords = self.local_kernel.name.endswith("extrusion") # FIXME is there a better way to know that this kernel generated the extrusion coords? + is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit) + vectorisable = is_loopy_kernel and ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords + + if vectorisable: + # change target to generate vectorized code via gcc vector + # extensions + wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget( + vec_fallback=lp.VectorizationFallback.OMP_SIMD + )) + # inline all inner kernels + names = self.local_kernel.code.callables_table + for name in names: + if (name in wrapper.callables_table.keys() + and isinstance(wrapper.callables_table[name], + lp.CallableKernel)): + wrapper = lp.inline_callable_kernel(wrapper, name) + + all_insn_preds = reduce( + frozenset.union, + (insn.predicates + for insn in wrapper.default_entrypoint.instructions), + frozenset()) + + if iname not in get_dependencies(tuple(all_insn_preds)): + # https://github.com/inducer/loopy/issues/615 + # TODO: get rid of this guard once the loopy issue is fixed + if configuration["vectorization_strategy"] == "cross-element": + wrapper = self.vectorise(wrapper, iname, + configuration["simd_width"]) + else: + raise NotImplementedError( + "Vectorization strategy" + f" '{configuration['vectorization_strategy']}'") + code = lp.generate_code_v2(wrapper) if self.local_kernel.cpp: @@ -354,8 +410,135 @@ def code_to_compile(self): preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" + return code.device_code() + def vectorise(self, wrapper, iname, batch_size): + """Return a vectorised version of wrapper, vectorising over iname. + + :arg wrapper: A loopy kernel to vectorise. + :arg iname: The iteration index to vectorise over. + :arg batch_size: The vector width.""" + if batch_size == 1: + return wrapper + + from functools import reduce + import pymbolic.primitives as prim + + if not configuration["debug"]: + # loopy warns for every instruction that cannot be vectorized; + # ignore in non-debug mode. + new_entrypoint = wrapper.default_entrypoint.copy( + silenced_warnings=(wrapper.default_entrypoint.silenced_warnings + + ["vectorize_failed"])) + wrapper = wrapper.with_kernel(new_entrypoint) + + kernel = wrapper.default_entrypoint + + # {{{ get rid of noop insns + + from loopy.match import Id, Or + + noop_insn_ids = [Id(insn.id) + for insn in kernel.instructions + if isinstance(insn, lp.NoOpInstruction)] + kernel = lp.remove_instructions(kernel, Or(tuple(noop_insn_ids))) + + # }}} + + # align temps + alignment = configuration["alignment"] + tmps = {name: tv.copy(alignment=alignment) + for name, tv in kernel.temporary_variables.items()} + kernel = kernel.copy(temporary_variables=tmps) + + # {{{ record temps that cannot be vectorized + + # Do not vectorize temporaries used outside *iname* + temps_not_to_vectorize = reduce(set.union, + [(insn.dependency_names() + & frozenset(kernel.temporary_variables)) + for insn in kernel.instructions + if iname not in insn.within_inames], + set()) + + # Constant literal temporaries are arguments => cannot vectorize + temps_not_to_vectorize |= {name + for name, tv in kernel.temporary_variables.items() + if (tv.read_only + and tv.initializer is not None)} + + temps_not_to_vectorize |= {name + for name, tv in kernel.temporary_variables.items() + if kernel.writer_map().get(tv.name, set()) | kernel.reader_map().get(tv.name, set()) == set()} + + # {{{ clang (unlike gcc) does not allow taking address of vector-type + # variable + + # FIXME: Perform this only if we know we are not using gcc. + for insn in kernel.instructions: + if ( + isinstance(insn, lp.MultiAssignmentBase) + and isinstance(insn.expression, prim.Call) + and insn.expression.function.name in ["solve", "inverse"]): + temps_not_to_vectorize |= (insn.dependency_names()) + + # }}} + + # }}} + + # {{{ TODO: placeholder until loopy's simplify_using_pwaff gets smarter + + # transform to ensure that the loop undergoing array expansion has a + # lower bound of '0' + from loopy.symbolic import pw_aff_to_expr + lbound = pw_aff_to_expr(kernel.get_iname_bounds(iname).lower_bound_pw_aff) + shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift") + kernel = lp.affine_map_inames(kernel, iname, shifted_iname, + [(prim.Variable(shifted_iname), + (prim.Variable(iname) - lbound))]) + + # }}} + + # split iname + # note there is no front slab needed because iname is shifted (see above) + slabs = (0, 1) + inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch") + + kernel = lp.split_iname(kernel, shifted_iname, batch_size, slabs=slabs, + inner_iname=inner_iname) + + # adds a new axis to the temporary and indexes it with the provided iname + # i.e. stores the value at each instance of the loop. (i.e. array + # expansion) + kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname) + + # tag axes of the temporaries as vectorised + for name, tmp in kernel.temporary_variables.items(): + if name not in temps_not_to_vectorize: + tag = (len(tmp.shape)-1)*"c," + "vec" + kernel = lp.tag_array_axes(kernel, name, tag) + + # tag the inner iname as vectorized + kernel = lp.tag_inames(kernel, {inner_iname: "vec"}) + + # {{{ duplicate the inames surrounding the CInstructions with predicates + + cinsn_ids = [cinsn.id + for cinsn in kernel.instructions + if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)] + + for cinsn_id in cinsn_ids: + kernel = lp.duplicate_inames(kernel, + (inner_iname,), + within=f"id:{cinsn_id}", + tags={inner_iname: "unr"} + ) + + # }}} + + return wrapper.with_kernel(kernel) + @PETSc.Log.EventDecorator() @mpi.collective def compile(self, comm): diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 2863ab88f..acd7b7f18 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -190,6 +190,10 @@ def _compute(self, part): :arg part: The :class:`SetPartition` to compute over. """ + if configuration["extra_info"]: + nbytes = self.comm.allreduce(self.nbytes) + if self.comm.Get_rank() == 0: + print("{0}_BYTES= {1}".format(self.global_kernel.name, nbytes)) with self._compute_event(): PETSc.Log.logFlops(part.size*self.num_flops) self.global_kernel(self.comm, part.offset, part.offset+part.size, *self.arglist) @@ -198,6 +202,25 @@ def _compute(self, part): def num_flops(self): return self.global_kernel.num_flops(self.iterset) + @cached_property + def nbytes(self): + nbytes = 0 + seen = set() + for lk_arg, gk_arg, pl_arg in self.zipped_arguments: + if lk_arg.access == Access.INC: + nbytes += pl_arg.data.nbytes * 2 + else: + nbytes += pl_arg.data.nbytes + for map_ in pl_arg.maps: + if map_ is None: + continue + for k in map_._kernel_args_: + if k in seen: + continue + nbytes += map_.values.nbytes + seen.add(k) + return nbytes + @mpi.collective def compute(self): # Parloop.compute is an alias for Parloop.__call__ diff --git a/requirements-git.txt b/requirements-git.txt index 438205043..0e5dce623 100644 --- a/requirements-git.txt +++ b/requirements-git.txt @@ -1,2 +1,2 @@ git+https://github.com/coneoproject/COFFEE.git#egg=coffee -git+https://github.com/firedrakeproject/loopy.git@main#egg=loopy +git+https://github.com/firedrakeproject/loopy.git@c_vecextensions_target_241022#egg=loopy diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py new file mode 100644 index 000000000..7425a9eae --- /dev/null +++ b/test/unit/test_vectorisation.py @@ -0,0 +1,98 @@ +# This file is part of PyOP2 +# +# PyOP2 is Copyright (c) 2012, Imperial College London and +# others. Please see the AUTHORS file in the main source directory for +# a full list of copyright holders. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# * The name of Imperial College London or that of other +# contributors may not be used to endorse or promote products +# derived from this software without specific prior written +# permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS +# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED +# OF THE POSSIBILITY OF SUCH DAMAGE. +import numpy as np +from pyop2 import op2 +from pyop2.configuration import configuration +import os +import pytest +from pyop2.parloop import LegacyParloop +from pyop2.types.glob import Global +from pyop2.types import Access + + +some_vectorisation_keys = ["__attribute__", "vector_size", "aligned", "#pragma omp simd"] + + +class TestVectorisation: + + @pytest.fixture + def s(self): + return op2.Set(1) + + @pytest.fixture + def d1(self, s): + return op2.Dat(s, [3], np.float64) + + @pytest.fixture + def d(self, s): + return op2.Dat(s, [4], np.float64) + + def inner(self, s, o): + s._check_shape(o) + ret = Global(1, data=0, dtype=s.dtype) + inner_parloop = LegacyParloop(s._inner_kernel(o.dtype), s.dataset.set, + s(Access.READ), o(Access.READ), ret(Access.INC)) + inner_parloop.compute() + return (inner_parloop.global_kernel, ret.data_ro[0]) + + def test_vectorisation(self, d1, d): + # Test that vectorised code produced the correct result + kernel1, ret = self.inner(d, d1) + assert abs(ret - 12) < 1e-12 + kernel2, ret = self.inner(d1, d) + assert abs(ret - 12) < 1e-12 + + # Test that we actually vectorised + assert all(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has not been succesfully vectorised." + assert all(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has not been succesfully vectorised." + + def test_no_vectorisation(self, d1, d): + # turn vectorisation off + configuration.reconfigure(vectorization_strategy="") + + # Test that unvectorised code produced the correct result + kernel1, ret = self.inner(d, d1) + assert abs(ret - 12) < 1e-12 + kernel2, ret = self.inner(d1, d) + assert abs(ret - 12) < 1e-12 + + # Test that we did not vectorise + assert not any(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has been vectorised even though we turned it off." + assert not any(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has been vectorised even though we turned it off." + + # change vect config back to be turned on by default + configuration.reconfigure(vectorization_strategy="cross-element") + + +if __name__ == '__main__': + pytest.main(os.path.abspath(__file__))