This repository has been archived by the owner on Nov 27, 2024. It is now read-only.

Test updated vec #677

Draft
wants to merge 110 commits into base: master
Changes from all commits
Commits
110 commits
2a8d17c
codegen: Implement SIMD vectorisation
tj-sun Apr 11, 2019
fbc6e4a
add omp simd vectorization mode
tj-sun Aug 1, 2019
5ae780d
add openmp flag and by pass workaround flag
tj-sun Aug 4, 2019
ba693dc
DROP BEFORE MERGE: test with correct loopy branch
wence- Apr 11, 2019
4ec0769
Turn of tree vectorize for certain gcc compilers. We might not need t…
sv2518 Jul 1, 2020
f9e60fd
Add simd compiler flags.
sv2518 Jul 1, 2020
00e073d
Remove time configuration.
sv2518 Jul 1, 2020
1cf7698
Default SIMD width.
sv2518 Jul 1, 2020
3e66946
Generate CVec Target with batch size infomation and move typedef into…
sv2518 Jul 3, 2020
1238ce8
Move zero declaration to loopy code base to be more robust in naming …
sv2518 Jul 3, 2020
1d54777
Added conditionals when to vectorise:
sv2518 Jul 15, 2020
b369213
Drop omp vectorisation.
sv2518 Jul 15, 2020
1c6346e
Add -march=native everywhere.
sv2518 Jul 16, 2020
856b6aa
Silence warnings.
sv2518 Jul 22, 2020
5e52ce1
Change vector tag.
sv2518 Aug 24, 2020
537c14c
Give more control over vectorisation to PyOP2.
sv2518 Sep 1, 2020
9317654
Naming adaption.
sv2518 Sep 1, 2020
6723b6a
Realize ilp first.
sv2518 Sep 1, 2020
38ebc8a
Jenkins.
sv2518 Sep 1, 2020
32b2910
Merge branch 'master' into vectorisation-restructure-checks
sv2518 Feb 28, 2022
944c6cf
DBM: run against new loopy branch
sv2518 Mar 1, 2022
3a1eb24
Lint
sv2518 Mar 1, 2022
681e315
More adapations to new PyOP2
sv2518 Mar 1, 2022
48d6142
More adapations to new PyOP2
sv2518 Mar 1, 2022
792c8f0
DBM take the correct branch
sv2518 Mar 3, 2022
2469870
Adapt to new PyOP2 and vectorisation
sv2518 Mar 3, 2022
4bbcde5
Adapt to new PyOP2 and vectorisation
sv2518 Mar 3, 2022
a5c0455
Fix return wrapper with kernel not kernel
sv2518 Mar 3, 2022
c374031
We do need to inline bc Implementing transforms that apply cleanly ac…
sv2518 Mar 3, 2022
e7d31eb
First split then tag because loopy does not support retaggin of iname…
sv2518 Mar 3, 2022
56a8dde
tag_array_axes requires us to specify the tags for each dimension of …
sv2518 Mar 3, 2022
d1171b3
Fix
sv2518 Mar 3, 2022
0641c75
fix
sv2518 Mar 3, 2022
644842e
improve comments
sv2518 Mar 3, 2022
9e58b22
tag only non-constant arrays with vec axes
kaushikcfd Mar 3, 2022
3f133fd
Only vectorise when local kernel is a loopy thing.
sv2518 Mar 4, 2022
dcd0b69
shift iel-loop to have lbound of 0
kaushikcfd Mar 4, 2022
907fe58
Fix import
sv2518 Mar 6, 2022
ca2aaaf
Debug: try with newer python version
sv2518 Mar 6, 2022
0440f66
Debug: try with newer python version
sv2518 Mar 6, 2022
4bcb592
change target before inlining
kaushikcfd Mar 7, 2022
d42e7e8
ignore loopy vectorization fallback warnings
kaushikcfd Mar 7, 2022
7e37e02
Revert "Debug: try with newer python version"
sv2518 Mar 6, 2022
b541dbd
Make complex check tighter
sv2518 Mar 11, 2022
caa567a
extend the set of variables that cannot be vecotrized
kaushikcfd Mar 11, 2022
c3a96fa
Attempt to fix Slate by inlining of all subkernels
sv2518 Mar 14, 2022
dc996de
Add comment
sv2518 Mar 14, 2022
fa343e1
placate flake8
kaushikcfd Mar 15, 2022
aa7bc0c
blas callables: do not accept vectorized dtypes
kaushikcfd Apr 1, 2022
8302d52
allow inverse.c::inverse() to take in vector dtypes
kaushikcfd May 5, 2022
a767fe2
Merge remote-tracking branch 'origin/master' into vectorisation-sprint
kaushikcfd May 5, 2022
85de156
do not invoke the vectorization pass if one of the arguments is a Mix…
kaushikcfd May 5, 2022
30f8ecb
makes freeing logic accurate
kaushikcfd May 5, 2022
0d5023d
rewrite solve to accept strided inputs
kaushikcfd May 6, 2022
d25545b
blas-helpers: corrects the freeing logic
kaushikcfd May 6, 2022
0ade829
Don't vectorise the kernel which generates the coordinates for the ex…
sv2518 May 6, 2022
a4bab8e
PyOP2 compilation: add a pathway to compile with gcc on Mac.
sv2518 May 6, 2022
175eb14
do not vectorize the entire kernel if some instruction are surrounded…
kaushikcfd May 8, 2022
8256bd2
loop being split starts from '0' => do not peel at the head
kaushikcfd May 8, 2022
6585dbb
Merge branch 'vectorisation-sprint' of github.com:OP2/PyOP2 into vect…
sv2518 May 9, 2022
4c0ca6e
Add comment
sv2518 May 9, 2022
e744092
Fix complex check?
sv2518 May 10, 2022
5fc4264
Fix complex check?
sv2518 May 10, 2022
31f0c39
Fix complex check?
sv2518 May 10, 2022
7e8a86a
Fix complex check?
sv2518 May 10, 2022
63f1e52
clarifies vectorization strategy
kaushikcfd May 11, 2022
8b19370
Updates to transform startegy
kaushikcfd May 11, 2022
7a2cbd6
Time configuration is not used anywhere and add doc
sv2518 May 19, 2022
69d4921
Move conditional
sv2518 May 19, 2022
43960e6
sun2020study -> cross-element
sv2518 May 19, 2022
b4c9926
Make default_simd_width more readable
sv2518 May 19, 2022
c603f3f
cleanup
sv2518 May 19, 2022
1cee3d7
Lint
sv2518 May 19, 2022
a671b6c
corrects the condition to not vectorize temps passed to BLAS calls
kaushikcfd May 20, 2022
4aa86e1
Add vectorisation config to cache keys
sv2518 May 24, 2022
60b4b3e
Tests: add a vectorisation test
sv2518 May 24, 2022
1b3c29e
Cleanup
sv2518 May 24, 2022
0a54a34
Cleanup
sv2518 May 24, 2022
9b23200
Use reconfigure not init for changing the vectorisation strategy in t…
sv2518 May 24, 2022
acb9c89
Cleanup
sv2518 May 24, 2022
49e2779
Test: improve the vectorisation test.
sv2518 May 24, 2022
e5fe4d2
Put vectorisation strategy only in cache key of the global kernel.
sv2518 May 24, 2022
0eff9d6
lint
sv2518 May 25, 2022
22ce06e
Fix docs
sv2518 May 25, 2022
bdefbfa
Fix config error
sv2518 May 25, 2022
2a459e5
Fix config error
sv2518 May 25, 2022
56c65da
Don't add py-cpuinfo
May 27, 2022
ca5c51b
Add nbytes property
connorjward Jun 22, 2022
dc5f3bc
Drop unused args
sv2518 Jun 22, 2022
ac36708
Time->extra_info
sv2518 Jun 22, 2022
89c9dec
Merge branch 'vectorisation-sprint' into connorjward/add-nbytes
sv2518 Jun 22, 2022
e2af4c7
Merge pull request #666 from OP2/connorjward/add-nbytes
sv2518 Jun 22, 2022
4de6f06
Merge branch 'vectorisation-sprint' into JDBetteridge/vectorisation-s…
sv2518 Jun 22, 2022
2840f28
Merge pull request #665 from OP2/JDBetteridge/vectorisation-sprint
sv2518 Jun 22, 2022
89feb72
Fix bandwidth calculation
Jun 24, 2022
0857145
Add simd compiler flag also to LinuxGNU compiler
Jun 24, 2022
662241e
Add vectorisation flag to linux clang compiler too
Jun 27, 2022
203223c
account for changed in loopy's vectorization syntax
kaushikcfd Jul 6, 2022
fae323f
run CI with py3.8
kaushikcfd Jul 6, 2022
030cae5
Fallback for stopping criterium
sv2518 Jul 7, 2022
ece0e62
Fallback for stopping criterium
sv2518 Jul 7, 2022
934e147
Reduce inames to untag
sv2518 Jul 7, 2022
bd95ba3
Reduce inames to untag
sv2518 Jul 7, 2022
fd6650d
Fallback for stopping criterium
sv2518 Jul 7, 2022
f69755d
unroll (not vectorize) loops surrounding CInstructions
kaushikcfd Jul 11, 2022
e72f316
get rid of noop insns
kaushikcfd Jul 11, 2022
09bf629
Fix merge leftovers for vectorisation in chapter 3
sv2518 Oct 4, 2022
3cbb858
Merge branch 'master' into vectorisation-sprint-241022
sv2518 Oct 24, 2022
179613f
pick new loopy vec branch
sv2518 Oct 24, 2022
d441cc1
pick new loopy vec branch
sv2518 Oct 24, 2022
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
- name: Set correct Python version
uses: actions/setup-python@v2
with:
python-version: '3.6'
python-version: '3.8'

- name: Clone PETSc
uses: actions/checkout@v2
5 changes: 5 additions & 0 deletions pyop2/__init__.py
@@ -7,3 +7,8 @@
from pyop2._version import get_versions
__version__ = get_versions()['version']
del get_versions

from pyop2.configuration import configuration
from pyop2.compilation import max_simd_width
if configuration["vectorization_strategy"]:
configuration["simd_width"] = max_simd_width()
42 changes: 35 additions & 7 deletions pyop2/codegen/c/inverse.c
@@ -6,8 +6,10 @@
#define BUF_SIZE 30
static PetscBLASInt ipiv_buffer[BUF_SIZE];
static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE];
static PetscScalar Aout_proxy_buffer[BUF_SIZE*BUF_SIZE];
#endif


#ifndef PYOP2_INV_LOG_EVENTS
#define PYOP2_INV_LOG_EVENTS
PetscLogEvent ID_inv_memcpy = -1;
@@ -16,32 +18,58 @@ PetscLogEvent ID_inv_getri = -1;
static PetscBool log_active_inv = 0;
#endif

void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N)
static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N,
PetscBLASInt incA, PetscBLASInt incAout)
{
PetscLogIsActive(&log_active_inv);
if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
PetscBLASInt info;
PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv));
PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork));
memcpy(Aout, A, N*N*sizeof(PetscScalar));

PetscInt N_sq = N * N;
PetscInt one = 1;

// Aout_proxy: 'Aout', but stored contiguously
PetscScalar *Aout_proxy;
if (incAout == 1)
Aout_proxy = Aout;
else
{
// TODO: Must see if allocating has a significant performance impact
Aout_proxy = N_sq <= BUF_SIZE ? Aout_proxy_buffer : malloc(N*N*sizeof(*Aout));
}

if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
BLAScopy_(&N_sq, A, &incA, Aout_proxy, &one);
if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);}

if (log_active_inv){PetscLogEventBegin(ID_inv_getrf,0,0,0,0);}
LAPACKgetrf_(&N, &N, Aout, &N, ipiv, &info);
LAPACKgetrf_(&N, &N, Aout_proxy, &N, ipiv, &info);
if (log_active_inv){PetscLogEventEnd(ID_inv_getrf,0,0,0,0);}

if(info == 0){
if (log_active_inv){PetscLogEventBegin(ID_inv_getri,0,0,0,0);}
LAPACKgetri_(&N, Aout, &N, ipiv, Awork, &N, &info);
LAPACKgetri_(&N, Aout_proxy, &N, ipiv, Awork, &N, &info);
if (log_active_inv){PetscLogEventEnd(ID_inv_getri,0,0,0,0);}

// Copy Aout_proxy back to Aout
if (Aout != Aout_proxy)
{
if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
BLAScopy_(&N_sq, Aout_proxy, &one, Aout, &incAout);
if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);}
}
}

if(info != 0){
fprintf(stderr, "Getri throws nonzero info.");
abort();
}
if ( N > BUF_SIZE ) {

if (Awork != work_buffer)
free(Awork);
if (ipiv != ipiv_buffer)
free(ipiv);
}
if ((Aout_proxy != Aout) && (Aout_proxy != Aout_proxy_buffer))
free(Aout_proxy);
}
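
For context, a minimal calling sketch of the new strided signature. It assumes this file is compiled into the same translation unit (the function is now static), PETSc/BLAS are linked and PetscScalar is real; the 4-wide "lane 0" layout is illustrative of cross-element batching, not something this hunk prescribes:

#include <petscsys.h>

int main(int argc, char **argv)
{
    PetscInitialize(&argc, &argv, NULL, NULL);

    PetscScalar A[4] = {4.0, 7.0, 2.0, 6.0};        /* 2x2, contiguous */
    PetscScalar Ainv[4];
    inverse(Ainv, A, 2, /*incA=*/1, /*incAout=*/1); /* unit strides: previous behaviour */

    /* Cross-element layout: the matrix occupies lane 0 of a 4-wide batch,
     * so consecutive entries are 4 scalars apart. */
    PetscScalar Abatch[4 * 4] = {0.0}, Ainvbatch[4 * 4];
    for (int i = 0; i < 4; ++i) Abatch[4 * i] = A[i];
    inverse(Ainvbatch, Abatch, 2, /*incA=*/4, /*incAout=*/4);

    PetscFinalize();
    return 0;
}
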
45 changes: 36 additions & 9 deletions pyop2/codegen/c/solve.c
@@ -8,6 +8,8 @@ static PetscBLASInt ipiv_buffer[BUF_SIZE];
static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE];
#endif

static PetscScalar out_proxy_buffer[BUF_SIZE];

#ifndef PYOP2_SOLVE_LOG_EVENTS
#define PYOP2_SOLVE_LOG_EVENTS
PetscLogEvent ID_solve_memcpy = -1;
@@ -16,15 +18,32 @@ PetscLogEvent ID_solve_getrs = -1;
static PetscBool log_active_solve = 0;
#endif

void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N)

/*
* @param[incA]: Stride value while accessing elements of 'A'.
* @param[incB]: Stride value while accessing elements of 'B'.
* @param[incOut]: Stride value while accessing elements of 'out'.
*/
void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N,
PetscBLASInt incA, PetscBLASInt incB, PetscBLASInt incOut)
{
PetscScalar* out_proxy; /// output laid-out with unit stride, expected by LAPACK
PetscInt N_sq = N*N;
PetscInt one = 1;
PetscLogIsActive(&log_active_solve);
if (log_active_solve){PetscLogEventBegin(ID_solve_memcpy,0,0,0,0);}
PetscBLASInt info;
PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv));
memcpy(out,B,N*sizeof(PetscScalar));
PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork));
memcpy(Awork,A,N*N*sizeof(PetscScalar));

if (incOut == 1)
out_proxy = out;
else
out_proxy = (N <= BUF_SIZE) ? out_proxy_buffer : malloc(N*sizeof(*out));

BLAScopy_(&N, B, &incB, out_proxy, &one);

PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N_sq*sizeof(*Awork));
BLAScopy_(&N_sq, A, &incA, Awork, &one);
if (log_active_solve){PetscLogEventEnd(ID_solve_memcpy,0,0,0,0);}

PetscBLASInt NRHS = 1;
@@ -35,7 +54,11 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con

if(info == 0){
if (log_active_solve){PetscLogEventBegin(ID_solve_getrs,0,0,0,0);}
LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out, &N, &info);
LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out_proxy, &N, &info);

if (out != out_proxy)
BLAScopy_(&N, out_proxy, &one, out, &incOut);

if (log_active_solve){PetscLogEventEnd(ID_solve_getrs,0,0,0,0);}
}

@@ -44,8 +67,12 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con
abort();
}

if ( N > BUF_SIZE ) {
free(ipiv);
free(Awork);
}
if (ipiv != ipiv_buffer)
free(ipiv);

if (Awork != work_buffer)
free(Awork);

if ((out_proxy != out) && (out_proxy != out_proxy_buffer))
free(out_proxy);
}
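
Similarly, a minimal sketch of driving the strided solve() directly, assuming it is compiled together with this file, PETSc/BLAS are linked and PetscScalar is real. The right-hand side is a column of a row-major array, the kind of non-unit stride the new incB argument exists for; the numbers are made up:

#include <petscsys.h>

int main(int argc, char **argv)
{
    PetscInitialize(&argc, &argv, NULL, NULL);

    enum { N = 2, M = 3 };
    PetscScalar A[N * N] = {2.0, 0.0, 0.0, 4.0};    /* diagonal 2x2 */
    PetscScalar B[N][M]  = {{0.0, 6.0, 0.0},
                            {0.0, 8.0, 0.0}};       /* rhs lives in column 1 */
    PetscScalar x[N];

    /* Consecutive rhs entries are M scalars apart in memory. */
    solve(x, A, &B[0][1], N, /*incA=*/1, /*incB=*/M, /*incOut=*/1);
    /* expected: x = {3.0, 2.0} */

    PetscFinalize();
    return 0;
}
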
100 changes: 96 additions & 4 deletions pyop2/codegen/rep2loopy.py
@@ -143,6 +143,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
callables_table)

def emit_call_insn(self, insn, target, expression_to_code_mapper):
from loopy.codegen import UnvectorizableError
assert self.is_ready_for_codegen()
assert isinstance(insn, loopy.CallInstruction)

@@ -151,6 +152,9 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper):
parameters = list(parameters)
par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]

if expression_to_code_mapper.codegen_state.vectorization_info:
raise UnvectorizableError("LACallable: cannot take in vector arrays")

parameters.append(insn.assignees[-1])
par_dtypes.append(self.arg_id_to_dtype[0])

@@ -177,6 +181,46 @@ class INVCallable(LACallable):
"""
name = "inverse"

def with_descrs(self, arg_id_to_descr, callables_table):
a_descr = arg_id_to_descr.get(0)
a_inv_descr = arg_id_to_descr.get(-1)

if a_descr is None or a_inv_descr is None:
# shapes aren't specialized enough to be resolved
return self, callables_table

assert len(a_descr.shape) == 2
assert a_descr.shape == a_inv_descr.shape
assert a_descr.shape[1] == a_descr.shape[0]

return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

def emit_call_insn(self, insn, target, expression_to_code_mapper):
from loopy.codegen import UnvectorizableError

# Override codegen to emit stride info. to the blas calls.
in_descr = self.arg_id_to_descr[0]
out_descr = self.arg_id_to_descr[-1]
ecm = expression_to_code_mapper

# see pyop2/codegen/c/inverse.c for the func. signature
inc_a = in_descr.dim_tags[1].stride
inc_a_out = out_descr.dim_tags[1].stride
n = in_descr.shape[0]

a, = insn.expression.parameters
a_out, = insn.assignees

if ecm.codegen_state.vectorization_info is not None:
raise UnvectorizableError("cannot vectorize 'inverse'.")

c_parameters = [ecm(a_out).expr,
ecm(a).expr,
n,
inc_a,
inc_a_out]
return var(self.name_in_target)(*c_parameters), False

def generate_preambles(self, target):
assert isinstance(target, type(target))
yield ("inverse", inverse_preamble)
@@ -189,19 +233,65 @@ class SolveCallable(LACallable):
"""
name = "solve"

def with_descrs(self, arg_id_to_descr, callables_table):
a_descr = arg_id_to_descr.get(0)
b_descr = arg_id_to_descr.get(1)
x_descr = arg_id_to_descr.get(-1)

if a_descr is None or b_descr is None:
# shapes aren't specialized enough to be resolved
return self, callables_table

assert len(a_descr.shape) == 2
assert len(x_descr.shape) == 1
assert b_descr.shape == x_descr.shape

return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table

def emit_call_insn(self, insn, target, expression_to_code_mapper):
from loopy.codegen import UnvectorizableError

# Override codegen to emit stride info. to the blas calls.
a_descr = self.arg_id_to_descr[0]
b_descr = self.arg_id_to_descr[1]
out_descr = self.arg_id_to_descr[-1]
ecm = expression_to_code_mapper

# see pyop2/codegen/c/solve.c for the func. signature
inc_a = a_descr.dim_tags[1].stride
inc_b = b_descr.dim_tags[0].stride
inc_out = out_descr.dim_tags[0].stride
n = a_descr.shape[0]

a, b = insn.expression.parameters
out, = insn.assignees

if ecm.codegen_state.vectorization_info is not None:
raise UnvectorizableError("cannot vectorize 'inverse'.")

c_parameters = [ecm(out).expr,
ecm(a).expr,
ecm(b).expr,
n,
inc_a,
inc_b,
inc_out]
return var(self.name_in_target)(*c_parameters), False

def generate_preambles(self, target):
assert isinstance(target, type(target))
yield ("solve", solve_preamble)


class _PreambleGen(ImmutableRecord):
fields = set(("preamble", ))
fields = {"preamble", "idx"}

def __init__(self, preamble):
def __init__(self, preamble, idx="0"):
self.preamble = preamble
self.idx = idx

def __call__(self, preamble_info):
yield ("0", self.preamble)
yield (self.idx, self.preamble)


class PyOP2KernelCallable(loopy.ScalarCallable):
@@ -537,7 +627,9 @@ def renamer(expr):
options=options,
assumptions=assumptions,
lang_version=(2018, 2),
name=wrapper_name)
name=wrapper_name,
# TODO, should these really be silenced?
silenced_warnings=["write_race*", "data_dep*"])

# prioritize loops
for indices in context.index_ordering:
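
As a pointer to where the incA/incB values above come from: emit_call_insn reads them off loopy's array dimension tags. A minimal sketch using loopy's FixedStrideArrayDimTag; the shape and strides below are invented and stand for a 3x3 temporary interleaved across a 4-wide SIMD batch:

from loopy.kernel.array import FixedStrideArrayDimTag

# Element (i, j) of the batched temporary sits at offset (i*3 + j) * 4,
# i.e. row stride 12 and column stride 4, counted in scalars.
dim_tags = (FixedStrideArrayDimTag(12), FixedStrideArrayDimTag(4))

inc_a = dim_tags[1].stride   # -> 4, forwarded to inverse()/solve() as the inc argument
n = 3                        # matrix extent, taken from descr.shape[0] in the callables
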
31 changes: 26 additions & 5 deletions pyop2/compilation.py
@@ -46,6 +46,7 @@

from pyop2.mpi import MPI, collective, COMM_WORLD
from pyop2.mpi import dup_comm, get_compilation_comm, set_compilation_comm
from pyop2.caching import cached
from pyop2.configuration import configuration
from pyop2.logger import warning, debug, progress, INFO
from pyop2.exceptions import CompilationError
@@ -458,14 +459,14 @@ class MacClangCompiler(Compiler):
_cxxflags = ("-fPIC", "-Wall", "-framework", "Accelerate")
_ldflags = ("-dynamiclib",)

_optflags = ("-O3", "-ffast-math", "-march=native")
_optflags = ("-O3", "-ffast-math", "-march=native", "-fopenmp-simd")
_debugflags = ("-O0", "-g")


class MacClangARMCompiler(MacClangCompiler):
"""A compiler for building a shared library on ARM based Mac systems."""
# See https://stackoverflow.com/q/65966969
_optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14")
_optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14", "-fopenmp-simd")
# Need to pass -L/opt/homebrew/opt/gcc/lib/gcc/11 to prevent linker error:
# ld: file not found: @rpath/libgcc_s.1.1.dylib for architecture arm64 This
# seems to be a homebrew configuration issue somewhere. Hopefully this
@@ -486,7 +487,7 @@ class LinuxGnuCompiler(Compiler):
_cxxflags = ("-fPIC", "-Wall")
_ldflags = ("-shared",)

_optflags = ("-march=native", "-O3", "-ffast-math")
_optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp")
_debugflags = ("-O0", "-g")

def sniff_compiler_version(self, cpp=False):
@@ -543,7 +544,7 @@ class LinuxClangCompiler(Compiler):
_cxxflags = ("-fPIC", "-Wall")
_ldflags = ("-shared", "-L/usr/lib")

_optflags = ("-march=native", "-O3", "-ffast-math")
_optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp-simd")
_debugflags = ("-O0", "-g")


@@ -558,7 +559,7 @@ class LinuxIntelCompiler(Compiler):
_cxxflags = ("-fPIC", "-no-multibyte-chars")
_ldflags = ("-shared",)

_optflags = ("-Ofast", "-xHost")
_optflags = ("-Ofast", "-xHost", "-qopenmp-simd")
_debugflags = ("-O0", "-g")


@@ -696,3 +697,23 @@ def clear_cache(prompt=False):
shutil.rmtree(cachedir)
else:
print("Not removing cached libraries")


@cached(cache={})
def max_simd_width():
prg_str = '''#include <stdio.h>

int get_simd_width(){
return __builtin_cpu_supports("avx512f") ? 8:
__builtin_cpu_supports("avx") ? 4:
__builtin_cpu_supports("sse") ? 2:
1;
}
'''
try:
simd_width = load(prg_str, "c", "get_simd_width", restype=ctypes.c_int)
width = simd_width()
except (OSError, CompilationError):
warning("Cannot sniff SIMD width, using default of 4 doubles")
width = 4
return width
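
A minimal usage sketch for the probe; the returned widths are doubles per vector register (AVX-512 gives 8, AVX 4, SSE 2), with 4 as the fallback when the probe fails to compile:

from pyop2.compilation import max_simd_width

width = max_simd_width()   # cached, so repeated calls do not recompile the probe
print(f"vectorising over batches of {width} doubles")
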