diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 45df5ed57..ab79c43fd 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -30,7 +30,7 @@ jobs:
       - name: Set correct Python version
         uses: actions/setup-python@v2
         with:
-          python-version: '3.6'
+          python-version: '3.8'
 
       - name: Clone PETSc
         uses: actions/checkout@v2
diff --git a/pyop2/__init__.py b/pyop2/__init__.py
index f0deef2e1..864b55af9 100644
--- a/pyop2/__init__.py
+++ b/pyop2/__init__.py
@@ -7,3 +7,8 @@
 from pyop2._version import get_versions
 __version__ = get_versions()['version']
 del get_versions
+
+from pyop2.configuration import configuration
+from pyop2.compilation import max_simd_width
+if configuration["vectorization_strategy"]:
+    configuration["simd_width"] = max_simd_width()
diff --git a/pyop2/codegen/c/inverse.c b/pyop2/codegen/c/inverse.c
index 7f445d385..be8bafd6c 100644
--- a/pyop2/codegen/c/inverse.c
+++ b/pyop2/codegen/c/inverse.c
@@ -6,8 +6,10 @@
 #define BUF_SIZE 30
 static PetscBLASInt ipiv_buffer[BUF_SIZE];
 static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE];
+static PetscScalar Aout_proxy_buffer[BUF_SIZE*BUF_SIZE];
 #endif
 
+
 #ifndef PYOP2_INV_LOG_EVENTS
 #define PYOP2_INV_LOG_EVENTS
 PetscLogEvent ID_inv_memcpy = -1;
@@ -16,32 +18,58 @@ PetscLogEvent ID_inv_getri = -1;
 static PetscBool log_active_inv = 0;
 #endif
 
-void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N)
+static void inverse(PetscScalar* __restrict__ Aout, const PetscScalar* __restrict__ A, PetscBLASInt N,
+                    PetscBLASInt incA, PetscBLASInt incAout)
 {
     PetscLogIsActive(&log_active_inv);
-    if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
     PetscBLASInt info;
     PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv));
     PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork));
-    memcpy(Aout, A, N*N*sizeof(PetscScalar));
+
+    PetscInt N_sq = N * N;
+    PetscInt one = 1;
+
+    // Aout_proxy: 'Aout', but stored contiguously
+    PetscScalar *Aout_proxy;
+    if (incAout == 1)
+      Aout_proxy = Aout;
+    else
+    {
+      // TODO: Must see if allocating has a significant performance impact
+      Aout_proxy = N_sq <= BUF_SIZE ? Aout_proxy_buffer : malloc(N*N*sizeof(*Aout));
+    }
+
+    if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
+    BLAScopy_(&N_sq, A, &incA, Aout_proxy, &one);
     if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);}
 
     if (log_active_inv){PetscLogEventBegin(ID_inv_getrf,0,0,0,0);}
-    LAPACKgetrf_(&N, &N, Aout, &N, ipiv, &info);
+    LAPACKgetrf_(&N, &N, Aout_proxy, &N, ipiv, &info);
     if (log_active_inv){PetscLogEventEnd(ID_inv_getrf,0,0,0,0);}
 
     if(info == 0){
         if (log_active_inv){PetscLogEventBegin(ID_inv_getri,0,0,0,0);}
-        LAPACKgetri_(&N, Aout, &N, ipiv, Awork, &N, &info);
+        LAPACKgetri_(&N, Aout_proxy, &N, ipiv, Awork, &N, &info);
         if (log_active_inv){PetscLogEventEnd(ID_inv_getri,0,0,0,0);}
+
+        // Copy Aout_proxy back to Aout
+        if (Aout != Aout_proxy)
+        {
+          if (log_active_inv){PetscLogEventBegin(ID_inv_memcpy,0,0,0,0);}
+          BLAScopy_(&N_sq, Aout_proxy, &one, Aout, &incAout);
+          if (log_active_inv){PetscLogEventEnd(ID_inv_memcpy,0,0,0,0);}
+        }
     }
 
     if(info != 0){
         fprintf(stderr, "Getri throws nonzero info.");
         abort();
     }
-    if ( N > BUF_SIZE ) {
+
+    if (Awork != work_buffer)
         free(Awork);
+    if (ipiv != ipiv_buffer)
         free(ipiv);
-    }
+    if ((Aout_proxy != Aout) && (Aout_proxy != Aout_proxy_buffer))
+        free(Aout_proxy);
 }
diff --git a/pyop2/codegen/c/solve.c b/pyop2/codegen/c/solve.c
index fbabc9588..c1444f1d1 100644
--- a/pyop2/codegen/c/solve.c
+++ b/pyop2/codegen/c/solve.c
@@ -8,6 +8,8 @@ static PetscBLASInt ipiv_buffer[BUF_SIZE];
 static PetscScalar work_buffer[BUF_SIZE*BUF_SIZE];
 #endif
 
+static PetscScalar out_proxy_buffer[BUF_SIZE];
+
 #ifndef PYOP2_SOLVE_LOG_EVENTS
 #define PYOP2_SOLVE_LOG_EVENTS
 PetscLogEvent ID_solve_memcpy = -1;
@@ -16,15 +18,32 @@ PetscLogEvent ID_solve_getrs = -1;
 static PetscBool log_active_solve = 0;
 #endif
 
-void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N)
+
+/*
+ * @param[incA]: Stride value while accessing elements of 'A'.
+ * @param[incB]: Stride value while accessing elements of 'B'.
+ * @param[incOut]: Stride value while accessing elements of 'out'.
+ */
+void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, const PetscScalar* __restrict__ B, PetscBLASInt N,
+           PetscBLASInt incA, PetscBLASInt incB, PetscBLASInt incOut)
 {
+    PetscScalar* out_proxy;  /// output laid-out with unit stride, expected by LAPACK
+    PetscInt N_sq = N*N;
+    PetscInt one = 1;
     PetscLogIsActive(&log_active_solve);
     if (log_active_solve){PetscLogEventBegin(ID_solve_memcpy,0,0,0,0);}
     PetscBLASInt info;
     PetscBLASInt *ipiv = N <= BUF_SIZE ? ipiv_buffer : malloc(N*sizeof(*ipiv));
-    memcpy(out,B,N*sizeof(PetscScalar));
-    PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N*N*sizeof(*Awork));
-    memcpy(Awork,A,N*N*sizeof(PetscScalar));
+
+    if (incOut == 1)
+      out_proxy = out;
+    else
+      out_proxy = (N <= BUF_SIZE) ? out_proxy_buffer : malloc(N*sizeof(*out));
+
+    BLAScopy_(&N, B, &incB, out_proxy, &one);
+
+    PetscScalar *Awork = N <= BUF_SIZE ? work_buffer : malloc(N_sq*sizeof(*Awork));
+    BLAScopy_(&N_sq, A, &incA, Awork, &one);
     if (log_active_solve){PetscLogEventEnd(ID_solve_memcpy,0,0,0,0);}
 
     PetscBLASInt NRHS = 1;
@@ -35,7 +54,11 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con
 
     if(info == 0){
         if (log_active_solve){PetscLogEventBegin(ID_solve_getrs,0,0,0,0);}
-        LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out, &N, &info);
+        LAPACKgetrs_(&T, &N, &NRHS, Awork, &N, ipiv, out_proxy, &N, &info);
+
+        if (out != out_proxy)
+            BLAScopy_(&N, out_proxy, &one, out, &incOut);
+
         if (log_active_solve){PetscLogEventEnd(ID_solve_getrs,0,0,0,0);}
     }
 
@@ -44,8 +67,12 @@ void solve(PetscScalar* __restrict__ out, const PetscScalar* __restrict__ A, con
         abort();
     }
 
-    if ( N > BUF_SIZE ) {
-        free(ipiv);
-        free(Awork);
-    }
+    if (ipiv != ipiv_buffer)
+      free(ipiv);
+
+    if (Awork != work_buffer)
+      free(Awork);
+
+    if ((out_proxy != out) && (out_proxy != out_proxy_buffer))
+      free(out_proxy);
 }
diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py
index c083dd605..dfbd3a656 100644
--- a/pyop2/codegen/rep2loopy.py
+++ b/pyop2/codegen/rep2loopy.py
@@ -143,6 +143,7 @@ def with_types(self, arg_id_to_dtype, callables_table):
                 callables_table)
 
     def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from loopy.codegen import UnvectorizableError
         assert self.is_ready_for_codegen()
         assert isinstance(insn, loopy.CallInstruction)
 
@@ -151,6 +152,9 @@ def emit_call_insn(self, insn, target, expression_to_code_mapper):
         parameters = list(parameters)
         par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
 
+        if expression_to_code_mapper.codegen_state.vectorization_info:
+            raise UnvectorizableError("LACallable: cannot take in vector arrays")
+
         parameters.append(insn.assignees[-1])
         par_dtypes.append(self.arg_id_to_dtype[0])
 
@@ -177,6 +181,46 @@ class INVCallable(LACallable):
     """
     name = "inverse"
 
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        a_descr = arg_id_to_descr.get(0)
+        a_inv_descr = arg_id_to_descr.get(-1)
+
+        if a_descr is None or a_inv_descr is None:
+            # shapes aren't specialized enough to be resolved
+            return self, callables_table
+
+        assert len(a_descr.shape) == 2
+        assert a_descr.shape == a_inv_descr.shape
+        assert a_descr.shape[1] == a_descr.shape[0]
+
+        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from loopy.codegen import UnvectorizableError
+
+        # Override codegen to emit stride info. to the blas calls.
+        in_descr = self.arg_id_to_descr[0]
+        out_descr = self.arg_id_to_descr[-1]
+        ecm = expression_to_code_mapper
+
+        # see pyop2/codegen/c/inverse.c for the func. signature
+        inc_a = in_descr.dim_tags[1].stride
+        inc_a_out = out_descr.dim_tags[1].stride
+        n = in_descr.shape[0]
+
+        a, = insn.expression.parameters
+        a_out, = insn.assignees
+
+        if ecm.codegen_state.vectorization_info is not None:
+            raise UnvectorizableError("cannot vectorize 'inverse'.")
+
+        c_parameters = [ecm(a_out).expr,
+                        ecm(a).expr,
+                        n,
+                        inc_a,
+                        inc_a_out]
+        return var(self.name_in_target)(*c_parameters), False
+
     def generate_preambles(self, target):
         assert isinstance(target, type(target))
         yield ("inverse", inverse_preamble)
@@ -189,19 +233,65 @@ class SolveCallable(LACallable):
     """
     name = "solve"
 
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        a_descr = arg_id_to_descr.get(0)
+        b_descr = arg_id_to_descr.get(1)
+        x_descr = arg_id_to_descr.get(-1)
+
+        if a_descr is None or b_descr is None:
+            # shapes aren't specialized enough to be resolved
+            return self, callables_table
+
+        assert len(a_descr.shape) == 2
+        assert len(x_descr.shape) == 1
+        assert b_descr.shape == x_descr.shape
+
+        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from loopy.codegen import UnvectorizableError
+
+        # Override codegen to emit stride info. to the blas calls.
+        a_descr = self.arg_id_to_descr[0]
+        b_descr = self.arg_id_to_descr[1]
+        out_descr = self.arg_id_to_descr[-1]
+        ecm = expression_to_code_mapper
+
+        # see pyop2/codegen/c/solve.c for the func. signature
+        inc_a = a_descr.dim_tags[1].stride
+        inc_b = b_descr.dim_tags[0].stride
+        inc_out = out_descr.dim_tags[0].stride
+        n = a_descr.shape[0]
+
+        a, b = insn.expression.parameters
+        out, = insn.assignees
+
+        if ecm.codegen_state.vectorization_info is not None:
+            raise UnvectorizableError("cannot vectorize 'inverse'.")
+
+        c_parameters = [ecm(out).expr,
+                        ecm(a).expr,
+                        ecm(b).expr,
+                        n,
+                        inc_a,
+                        inc_b,
+                        inc_out]
+        return var(self.name_in_target)(*c_parameters), False
+
     def generate_preambles(self, target):
         assert isinstance(target, type(target))
         yield ("solve", solve_preamble)
 
 
 class _PreambleGen(ImmutableRecord):
-    fields = set(("preamble", ))
+    fields = {"preamble", "idx"}
 
-    def __init__(self, preamble):
+    def __init__(self, preamble, idx="0"):
         self.preamble = preamble
+        self.idx = idx
 
     def __call__(self, preamble_info):
-        yield ("0", self.preamble)
+        yield (self.idx, self.preamble)
 
 
 class PyOP2KernelCallable(loopy.ScalarCallable):
@@ -537,7 +627,9 @@ def renamer(expr):
                                 options=options,
                                 assumptions=assumptions,
                                 lang_version=(2018, 2),
-                                name=wrapper_name)
+                                name=wrapper_name,
+                                # TODO, should these really be silenced?
+                                silenced_warnings=["write_race*", "data_dep*"])
 
     # prioritize loops
     for indices in context.index_ordering:
diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index ecca43187..dfc2c2662 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -46,6 +46,7 @@
 
 from pyop2.mpi import MPI, collective, COMM_WORLD
 from pyop2.mpi import dup_comm, get_compilation_comm, set_compilation_comm
+from pyop2.caching import cached
 from pyop2.configuration import configuration
 from pyop2.logger import warning, debug, progress, INFO
 from pyop2.exceptions import CompilationError
@@ -458,14 +459,14 @@ class MacClangCompiler(Compiler):
     _cxxflags = ("-fPIC", "-Wall", "-framework", "Accelerate")
     _ldflags = ("-dynamiclib",)
 
-    _optflags = ("-O3", "-ffast-math", "-march=native")
+    _optflags = ("-O3", "-ffast-math", "-march=native", "-fopenmp-simd")
     _debugflags = ("-O0", "-g")
 
 
 class MacClangARMCompiler(MacClangCompiler):
     """A compiler for building a shared library on ARM based Mac systems."""
     # See https://stackoverflow.com/q/65966969
-    _optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14")
+    _optflags = ("-O3", "-ffast-math", "-mcpu=apple-a14", "-fopenmp-simd")
     # Need to pass -L/opt/homebrew/opt/gcc/lib/gcc/11 to prevent linker error:
     # ld: file not found: @rpath/libgcc_s.1.1.dylib for architecture arm64 This
     # seems to be a homebrew configuration issue somewhere. Hopefully this
@@ -486,7 +487,7 @@ class LinuxGnuCompiler(Compiler):
     _cxxflags = ("-fPIC", "-Wall")
     _ldflags = ("-shared",)
 
-    _optflags = ("-march=native", "-O3", "-ffast-math")
+    _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp")
     _debugflags = ("-O0", "-g")
 
     def sniff_compiler_version(self, cpp=False):
@@ -543,7 +544,7 @@ class LinuxClangCompiler(Compiler):
     _cxxflags = ("-fPIC", "-Wall")
     _ldflags = ("-shared", "-L/usr/lib")
 
-    _optflags = ("-march=native", "-O3", "-ffast-math")
+    _optflags = ("-march=native", "-O3", "-ffast-math", "-fopenmp-simd")
     _debugflags = ("-O0", "-g")
 
 
@@ -558,7 +559,7 @@ class LinuxIntelCompiler(Compiler):
     _cxxflags = ("-fPIC", "-no-multibyte-chars")
     _ldflags = ("-shared",)
 
-    _optflags = ("-Ofast", "-xHost")
+    _optflags = ("-Ofast", "-xHost", "-qopenmp-simd")
     _debugflags = ("-O0", "-g")
 
 
@@ -696,3 +697,23 @@ def clear_cache(prompt=False):
         shutil.rmtree(cachedir)
     else:
         print("Not removing cached libraries")
+
+
+@cached(cache={})
+def max_simd_width():
+    prg_str = '''#include <stdio.h>
+
+int get_simd_width(){
+    return __builtin_cpu_supports("avx512f") ? 8:
+        __builtin_cpu_supports("avx") ? 4:
+        __builtin_cpu_supports("sse") ? 2:
+        1;
+}
+'''
+    try:
+        simd_width = load(prg_str, "c", "get_simd_width", restype=ctypes.c_int)
+        width = simd_width()
+    except (OSError, CompilationError):
+        warning("Cannot sniff SIMD width, using default of 4 doubles")
+        width = 4
+    return width
diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index 29717718c..b6c8dc9cd 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -74,6 +74,13 @@ class Configuration(dict):
         cdim > 1 be built as block sparsities, or dof sparsities.  The
         former saves memory but changes which preconditioners are
         available for the resulting matrices.  (Default yes)
+    :param vectorization_strategy: A :class:`str` describing the
+        vectorization strategy that must to be applied to the kernels. Can
+        be one of the following --
+        - ``cross-element``: Cross-element vectorization strategy of
+          `<https://doi.org/10.1177/1094342020945005>`__.
+    :param alignment: A :class:`int` which specifies a size to which all temporaries
+        are aligned in memory.
     """
     # name, env variable, type, default, write once
     cache_dir = os.path.join(gettempdir(), "pyop2-cache-uid%s" % os.getuid())
@@ -92,6 +99,12 @@ class Configuration(dict):
             ("PYOP2_LDFLAGS", str, ""),
         "simd_width":
             ("PYOP2_SIMD_WIDTH", int, 4),
+        "extra_info":
+            ("PYOP2_EXTRA_INFO", bool, False),
+        "vectorization_strategy":
+            ("PYOP2_VECT_STRATEGY", str, "cross-element"),
+        "alignment":
+            ("PYOP2_ALIGNMENT", int, 64),
         "debug":
             ("PYOP2_DEBUG", bool, False),
         "compute_kernel_flops":
diff --git a/pyop2/global_kernel.py b/pyop2/global_kernel.py
index 6ef49dfa6..776ddd8f4 100644
--- a/pyop2/global_kernel.py
+++ b/pyop2/global_kernel.py
@@ -15,6 +15,7 @@
 from pyop2.datatypes import IntType, as_ctypes
 from pyop2.types import IterationRegion
 from pyop2.utils import cached_property, get_petsc_dir
+from pyop2 import op2
 
 
 # We set eq=False to force identity-based hashing. This is required for when
@@ -211,7 +212,7 @@ def __len__(self):
 
     @property
     def cache_key(self):
-        return tuple(a.cache_key for a in self.arguments)
+        return tuple(a.cache_key for a in self.arguments) + tuple(configuration["vectorization_strategy"])
 
     @property
     def maps(self):
@@ -252,7 +253,8 @@ class GlobalKernel(Cached):
     @classmethod
     def _cache_key(cls, local_knl, arguments, **kwargs):
         key = [cls, local_knl.cache_key,
-               *kwargs.items(), configuration["simd_width"]]
+               *kwargs.items(), configuration["simd_width"],
+               configuration["vectorization_strategy"]]
 
         key.extend([a.cache_key for a in arguments])
 
@@ -345,8 +347,62 @@ def builder(self):
     def code_to_compile(self):
         """Return the C/C++ source code as a string."""
         from pyop2.codegen.rep2loopy import generate
+        from loopy.symbolic import get_dependencies
+        from functools import reduce
 
         wrapper = generate(self.builder)
+        if self._extruded:
+            iname = "layer"
+        else:
+            iname = "n"
+
+        # TODO: vectorizing 2-form assembly kernels is possible, but must
+        # change the arguments passed to MatSetValuesxxx (not yet implemented)
+        has_matrix = any(isinstance(arg, (MatKernelArg, MixedMatKernelArg))
+                         for arg in self.arguments)
+        has_rw = any(arg.access == op2.RW for arg in self.local_kernel.arguments)
+        is_cplx = (any(lp.types.NumpyType(dtype).is_complex()
+                       if not isinstance(dtype, lp.types.NumpyType)
+                       else dtype.is_complex()
+                       for dtype in self.local_kernel.dtypes
+                       )  # local args complex?
+                   or any(arg.dtype.is_complex()
+                          for arg in tuple(wrapper.default_entrypoint.temporary_variables.values())))  # global temps complex?
+        extruded_coords = self.local_kernel.name.endswith("extrusion")  # FIXME is there a better way to know that this kernel generated the extrusion coords?
+        is_loopy_kernel = isinstance(self.local_kernel.code, lp.TranslationUnit)
+        vectorisable = is_loopy_kernel and ((not (has_matrix or has_rw)) and (configuration["vectorization_strategy"])) and not is_cplx and not extruded_coords
+
+        if vectorisable:
+            # change target to generate vectorized code via gcc vector
+            # extensions
+            wrapper = wrapper.copy(target=lp.CVectorExtensionsTarget(
+                vec_fallback=lp.VectorizationFallback.OMP_SIMD
+            ))
+            # inline all inner kernels
+            names = self.local_kernel.code.callables_table
+            for name in names:
+                if (name in wrapper.callables_table.keys()
+                    and isinstance(wrapper.callables_table[name],
+                                   lp.CallableKernel)):
+                    wrapper = lp.inline_callable_kernel(wrapper, name)
+
+            all_insn_preds = reduce(
+                frozenset.union,
+                (insn.predicates
+                    for insn in wrapper.default_entrypoint.instructions),
+                frozenset())
+
+            if iname not in get_dependencies(tuple(all_insn_preds)):
+                # https://github.com/inducer/loopy/issues/615
+                # TODO: get rid of this guard once the loopy issue is fixed
+                if configuration["vectorization_strategy"] == "cross-element":
+                    wrapper = self.vectorise(wrapper, iname,
+                                             configuration["simd_width"])
+                else:
+                    raise NotImplementedError(
+                        "Vectorization strategy"
+                        f" '{configuration['vectorization_strategy']}'")
+
         code = lp.generate_code_v2(wrapper)
 
         if self.local_kernel.cpp:
@@ -354,8 +410,135 @@ def code_to_compile(self):
             preamble = "".join(process_preambles(getattr(code, "device_preambles", [])))
             device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs)
             return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n"
+
         return code.device_code()
 
+    def vectorise(self, wrapper, iname, batch_size):
+        """Return a vectorised version of wrapper, vectorising over iname.
+
+        :arg wrapper: A loopy kernel to vectorise.
+        :arg iname: The iteration index to vectorise over.
+        :arg batch_size: The vector width."""
+        if batch_size == 1:
+            return wrapper
+
+        from functools import reduce
+        import pymbolic.primitives as prim
+
+        if not configuration["debug"]:
+            # loopy warns for every instruction that cannot be vectorized;
+            # ignore in non-debug mode.
+            new_entrypoint = wrapper.default_entrypoint.copy(
+                silenced_warnings=(wrapper.default_entrypoint.silenced_warnings
+                                   + ["vectorize_failed"]))
+            wrapper = wrapper.with_kernel(new_entrypoint)
+
+        kernel = wrapper.default_entrypoint
+
+        # {{{ get rid of noop insns
+
+        from loopy.match import Id, Or
+
+        noop_insn_ids = [Id(insn.id)
+                         for insn in kernel.instructions
+                         if isinstance(insn, lp.NoOpInstruction)]
+        kernel = lp.remove_instructions(kernel, Or(tuple(noop_insn_ids)))
+
+        # }}}
+
+        # align temps
+        alignment = configuration["alignment"]
+        tmps = {name: tv.copy(alignment=alignment)
+                for name, tv in kernel.temporary_variables.items()}
+        kernel = kernel.copy(temporary_variables=tmps)
+
+        # {{{ record temps that cannot be vectorized
+
+        # Do not vectorize temporaries used outside *iname*
+        temps_not_to_vectorize = reduce(set.union,
+                                        [(insn.dependency_names()
+                                          & frozenset(kernel.temporary_variables))
+                                         for insn in kernel.instructions
+                                         if iname not in insn.within_inames],
+                                        set())
+
+        # Constant literal temporaries are arguments => cannot vectorize
+        temps_not_to_vectorize |= {name
+                                   for name, tv in kernel.temporary_variables.items()
+                                   if (tv.read_only
+                                       and tv.initializer is not None)}
+
+        temps_not_to_vectorize |= {name
+                                   for name, tv in kernel.temporary_variables.items()
+                                   if kernel.writer_map().get(tv.name, set()) | kernel.reader_map().get(tv.name, set()) == set()}
+
+        # {{{ clang (unlike gcc) does not allow taking address of vector-type
+        # variable
+
+        # FIXME: Perform this only if we know we are not using gcc.
+        for insn in kernel.instructions:
+            if (
+                    isinstance(insn, lp.MultiAssignmentBase)
+                    and isinstance(insn.expression, prim.Call)
+                    and insn.expression.function.name in ["solve", "inverse"]):
+                temps_not_to_vectorize |= (insn.dependency_names())
+
+        # }}}
+
+        # }}}
+
+        # {{{ TODO: placeholder until loopy's simplify_using_pwaff gets smarter
+
+        # transform to ensure that the loop undergoing array expansion has a
+        # lower bound of '0'
+        from loopy.symbolic import pw_aff_to_expr
+        lbound = pw_aff_to_expr(kernel.get_iname_bounds(iname).lower_bound_pw_aff)
+        shifted_iname = kernel.get_var_name_generator()(f"{iname}_shift")
+        kernel = lp.affine_map_inames(kernel, iname, shifted_iname,
+                                      [(prim.Variable(shifted_iname),
+                                       (prim.Variable(iname) - lbound))])
+
+        # }}}
+
+        # split iname
+        # note there is no front slab needed because iname is shifted (see above)
+        slabs = (0, 1)
+        inner_iname = kernel.get_var_name_generator()(f"{shifted_iname}_batch")
+
+        kernel = lp.split_iname(kernel, shifted_iname, batch_size, slabs=slabs,
+                                inner_iname=inner_iname)
+
+        # adds a new axis to the temporary and indexes it with the provided iname
+        # i.e. stores the value at each instance of the loop. (i.e. array
+        # expansion)
+        kernel = lp.privatize_temporaries_with_inames(kernel, inner_iname)
+
+        # tag axes of the temporaries as vectorised
+        for name, tmp in kernel.temporary_variables.items():
+            if name not in temps_not_to_vectorize:
+                tag = (len(tmp.shape)-1)*"c," + "vec"
+                kernel = lp.tag_array_axes(kernel, name, tag)
+
+        # tag the inner iname as vectorized
+        kernel = lp.tag_inames(kernel, {inner_iname: "vec"})
+
+        # {{{ duplicate the inames surrounding the CInstructions with predicates
+
+        cinsn_ids = [cinsn.id
+                     for cinsn in kernel.instructions
+                     if (isinstance(cinsn, lp.CInstruction) and cinsn.predicates)]
+
+        for cinsn_id in cinsn_ids:
+            kernel = lp.duplicate_inames(kernel,
+                                         (inner_iname,),
+                                         within=f"id:{cinsn_id}",
+                                         tags={inner_iname: "unr"}
+                                         )
+
+        # }}}
+
+        return wrapper.with_kernel(kernel)
+
     @PETSc.Log.EventDecorator()
     @mpi.collective
     def compile(self, comm):
diff --git a/pyop2/parloop.py b/pyop2/parloop.py
index 2863ab88f..acd7b7f18 100644
--- a/pyop2/parloop.py
+++ b/pyop2/parloop.py
@@ -190,6 +190,10 @@ def _compute(self, part):
 
         :arg part: The :class:`SetPartition` to compute over.
         """
+        if configuration["extra_info"]:
+            nbytes = self.comm.allreduce(self.nbytes)
+            if self.comm.Get_rank() == 0:
+                print("{0}_BYTES= {1}".format(self.global_kernel.name, nbytes))
         with self._compute_event():
             PETSc.Log.logFlops(part.size*self.num_flops)
             self.global_kernel(self.comm, part.offset, part.offset+part.size, *self.arglist)
@@ -198,6 +202,25 @@ def _compute(self, part):
     def num_flops(self):
         return self.global_kernel.num_flops(self.iterset)
 
+    @cached_property
+    def nbytes(self):
+        nbytes = 0
+        seen = set()
+        for lk_arg, gk_arg, pl_arg in self.zipped_arguments:
+            if lk_arg.access == Access.INC:
+                nbytes += pl_arg.data.nbytes * 2
+            else:
+                nbytes += pl_arg.data.nbytes
+            for map_ in pl_arg.maps:
+                if map_ is None:
+                    continue
+                for k in map_._kernel_args_:
+                    if k in seen:
+                        continue
+                    nbytes += map_.values.nbytes
+                    seen.add(k)
+        return nbytes
+
     @mpi.collective
     def compute(self):
         # Parloop.compute is an alias for Parloop.__call__
diff --git a/requirements-git.txt b/requirements-git.txt
index 438205043..0e5dce623 100644
--- a/requirements-git.txt
+++ b/requirements-git.txt
@@ -1,2 +1,2 @@
 git+https://github.com/coneoproject/COFFEE.git#egg=coffee
-git+https://github.com/firedrakeproject/loopy.git@main#egg=loopy
+git+https://github.com/firedrakeproject/loopy.git@c_vecextensions_target_241022#egg=loopy
diff --git a/test/unit/test_vectorisation.py b/test/unit/test_vectorisation.py
new file mode 100644
index 000000000..7425a9eae
--- /dev/null
+++ b/test/unit/test_vectorisation.py
@@ -0,0 +1,98 @@
+# This file is part of PyOP2
+#
+# PyOP2 is Copyright (c) 2012, Imperial College London and
+# others. Please see the AUTHORS file in the main source directory for
+# a full list of copyright holders.  All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in the
+#       documentation and/or other materials provided with the distribution.
+#     * The name of Imperial College London or that of other
+#       contributors may not be used to endorse or promote products
+#       derived from this software without specific prior written
+#       permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS
+# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+# OF THE POSSIBILITY OF SUCH DAMAGE.
+import numpy as np
+from pyop2 import op2
+from pyop2.configuration import configuration
+import os
+import pytest
+from pyop2.parloop import LegacyParloop
+from pyop2.types.glob import Global
+from pyop2.types import Access
+
+
+some_vectorisation_keys = ["__attribute__", "vector_size", "aligned", "#pragma omp simd"]
+
+
+class TestVectorisation:
+
+    @pytest.fixture
+    def s(self):
+        return op2.Set(1)
+
+    @pytest.fixture
+    def d1(self, s):
+        return op2.Dat(s, [3], np.float64)
+
+    @pytest.fixture
+    def d(self, s):
+        return op2.Dat(s, [4], np.float64)
+
+    def inner(self, s, o):
+        s._check_shape(o)
+        ret = Global(1, data=0, dtype=s.dtype)
+        inner_parloop = LegacyParloop(s._inner_kernel(o.dtype), s.dataset.set,
+                                      s(Access.READ), o(Access.READ), ret(Access.INC))
+        inner_parloop.compute()
+        return (inner_parloop.global_kernel, ret.data_ro[0])
+
+    def test_vectorisation(self, d1, d):
+        # Test that vectorised code produced the correct result
+        kernel1, ret = self.inner(d, d1)
+        assert abs(ret - 12) < 1e-12
+        kernel2, ret = self.inner(d1, d)
+        assert abs(ret - 12) < 1e-12
+
+        # Test that we actually vectorised
+        assert all(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has not been succesfully vectorised."
+        assert all(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has not been succesfully vectorised."
+
+    def test_no_vectorisation(self, d1, d):
+        # turn vectorisation off
+        configuration.reconfigure(vectorization_strategy="")
+
+        # Test that unvectorised code produced the correct result
+        kernel1, ret = self.inner(d, d1)
+        assert abs(ret - 12) < 1e-12
+        kernel2, ret = self.inner(d1, d)
+        assert abs(ret - 12) < 1e-12
+
+        # Test that we did not vectorise
+        assert not any(key in kernel1.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d, d) has been vectorised even though we turned it off."
+        assert not any(key in kernel2.code_to_compile for key in some_vectorisation_keys), "The kernel for an inner(d1, d) has been vectorised even though we turned it off."
+
+        # change vect config back to be turned on by default
+        configuration.reconfigure(vectorization_strategy="cross-element")
+
+
+if __name__ == '__main__':
+    pytest.main(os.path.abspath(__file__))