From fb0ee93eb5e1031c755afdd76c54e114131c89ea Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 29 Sep 2021 11:37:23 +0100 Subject: [PATCH 1/6] Split apart and linting passes --- pyop2/base.py | 3911 ------------------------- pyop2/kernel.py | 130 + pyop2/parloop.py | 884 ++++++ pyop2/sequential.py | 251 -- pyop2/types/__init__.py | 9 + pyop2/types/access.py | 37 + pyop2/types/dat.py | 1023 +++++++ pyop2/types/data_carrier.py | 109 + pyop2/types/dataset.py | 531 ++++ pyop2/types/glob.py | 290 ++ pyop2/types/halo.py | 56 + pyop2/types/map.py | 305 ++ pyop2/{petsc_base.py => types/mat.py} | 1101 +++---- pyop2/types/set.py | 626 ++++ 14 files changed, 4606 insertions(+), 4657 deletions(-) delete mode 100644 pyop2/base.py create mode 100644 pyop2/kernel.py create mode 100644 pyop2/parloop.py delete mode 100644 pyop2/sequential.py create mode 100644 pyop2/types/__init__.py create mode 100644 pyop2/types/access.py create mode 100644 pyop2/types/dat.py create mode 100644 pyop2/types/data_carrier.py create mode 100644 pyop2/types/dataset.py create mode 100644 pyop2/types/glob.py create mode 100644 pyop2/types/halo.py create mode 100644 pyop2/types/map.py rename pyop2/{petsc_base.py => types/mat.py} (51%) create mode 100644 pyop2/types/set.py diff --git a/pyop2/base.py b/pyop2/base.py deleted file mode 100644 index ba4929390..000000000 --- a/pyop2/base.py +++ /dev/null @@ -1,3911 +0,0 @@ -# This file is part of PyOP2 -# -# PyOP2 is Copyright (c) 2012, Imperial College London and -# others. Please see the AUTHORS file in the main source directory for -# a full list of copyright holders. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * The name of Imperial College London or that of other -# contributors may not be used to endorse or promote products -# derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS -# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -# OF THE POSSIBILITY OF SUCH DAMAGE. - -"""Base classes for OP2 objects, containing metadata and runtime data -information which is backend independent. Individual runtime backends should -subclass these as required to implement backend-specific features. 
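A rough sketch of how these classes combine, via the public ``pyop2.op2``
namespace (names and sizes here are illustrative only) ::

    from pyop2 import op2

    vertices = op2.Set(5)       # an iteration set
    dset = vertices ** 1        # a DataSet of dim 1 on it
    x = op2.Dat(dset)           # data allocated lazily, zeroed on first access
    # computation is then expressed as
    # op2.par_loop(kernel, vertices, x(op2.WRITE))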
-""" -import abc - -from enum import IntEnum -from collections import defaultdict -import itertools -import numpy as np -import ctypes -import numbers -import operator -import types -from hashlib import md5 - -from pyop2.datatypes import IntType, as_cstr, dtype_limits, ScalarType -from pyop2.configuration import configuration -from pyop2.caching import Cached, ObjectCached -from pyop2.exceptions import * -from pyop2.utils import * -from pyop2.mpi import MPI, collective, dup_comm -from pyop2.profiling import timed_region -from pyop2.sparsity import build_sparsity -from pyop2.version import __version__ as version - -from coffee.base import Node -from coffee.visitors import EstimateFlops -from functools import reduce - -import loopy - - -def _make_object(name, *args, **kwargs): - from pyop2 import sequential - return getattr(sequential, name)(*args, **kwargs) - - -# Data API - -class Access(IntEnum): - READ = 1 - WRITE = 2 - RW = 3 - INC = 4 - MIN = 5 - MAX = 6 - - -READ = Access.READ -"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed read-only.""" - -WRITE = Access.WRITE -"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed write-only, -and OP2 is not required to handle write conflicts.""" - -RW = Access.RW -"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed for reading -and writing, and OP2 is not required to handle write conflicts.""" - -INC = Access.INC -"""The kernel computes increments to be summed onto a :class:`Global`, -:class:`Dat`, or :class:`Mat`. OP2 is responsible for managing the write -conflicts caused.""" - -MIN = Access.MIN -"""The kernel contributes to a reduction into a :class:`Global` using a ``min`` -operation. OP2 is responsible for reducing over the different kernel -invocations.""" - -MAX = Access.MAX -"""The kernel contributes to a reduction into a :class:`Global` using a ``max`` -operation. OP2 is responsible for reducing over the different kernel -invocations.""" - -# Data API - - -class Arg(object): - - """An argument to a :func:`pyop2.op2.par_loop`. - - .. warning :: - User code should not directly instantiate :class:`Arg`. - Instead, use the call syntax on the :class:`DataCarrier`. - """ - - def __init__(self, data=None, map=None, access=None, lgmaps=None, unroll_map=False): - """ - :param data: A data-carrying object, either :class:`Dat` or class:`Mat` - :param map: A :class:`Map` to access this :class:`Arg` or the default - if the identity map is to be used. - :param access: An access descriptor of type :class:`Access` - :param lgmaps: For :class:`Mat` objects, a tuple of 2-tuples of local to - global maps used during assembly. - - Checks that: - - 1. the maps used are initialized i.e. have mapping data associated, and - 2. the to Set of the map used to access it matches the Set it is - defined on. 
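For example, given a :class:`Dat` ``d`` defined on the ``toset`` of a
:class:`Map` ``m`` (names illustrative), an :class:`Arg` is normally
obtained via the call syntax rather than constructed directly ::

    arg = d(READ, m)    # equivalent to Arg(data=d, map=m, access=READ)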
- - A :class:`MapValueError` is raised if these conditions are not met.""" - self.data = data - self._map = map - if map is None: - self.map_tuple = () - elif isinstance(map, Map): - self.map_tuple = (map, ) - else: - self.map_tuple = tuple(map) - - if data is not None and hasattr(data, "dtype"): - if data.dtype.kind == "c" and (access == MIN or access == MAX): - raise ValueError("MIN and MAX access descriptors are undefined on complex data.") - self._access = access - - self.unroll_map = unroll_map - self.lgmaps = None - if self._is_mat and lgmaps is not None: - self.lgmaps = as_tuple(lgmaps) - assert len(self.lgmaps) == self.data.nblocks - else: - if lgmaps is not None: - raise ValueError("Local to global maps only for matrices") - - # Check arguments for consistency - if configuration["type_check"] and not (self._is_global or map is None): - for j, m in enumerate(map): - if m.iterset.total_size > 0 and len(m.values_with_halo) == 0: - raise MapValueError("%s is not initialized." % map) - if self._is_mat and m.toset != data.sparsity.dsets[j].set: - raise MapValueError( - "To set of %s doesn't match the set of %s." % (map, data)) - if self._is_dat and map.toset != data.dataset.set: - raise MapValueError( - "To set of %s doesn't match the set of %s." % (map, data)) - - def recreate(self, data=None, map=None, access=None, lgmaps=None, unroll_map=None): - """Creates a new Dat based on the existing Dat with the changes specified. - - :param data: A data-carrying object, either :class:`Dat` or class:`Mat` - :param map: A :class:`Map` to access this :class:`Arg` or the default - if the identity map is to be used. - :param access: An access descriptor of type :class:`Access` - :param lgmaps: For :class:`Mat` objects, a tuple of 2-tuples of local to - global maps used during assembly.""" - return type(self)(data=data or self.data, - map=map or self.map, - access=access or self.access, - lgmaps=lgmaps or self.lgmaps, - unroll_map=False if unroll_map is None else unroll_map) - - @cached_property - def _kernel_args_(self): - return self.data._kernel_args_ - - @cached_property - def _argtypes_(self): - return self.data._argtypes_ - - @cached_property - def _wrapper_cache_key_(self): - if self.map is not None: - map_ = tuple(None if m is None else m._wrapper_cache_key_ for m in self.map) - else: - map_ = self.map - return (type(self), self.access, self.data._wrapper_cache_key_, map_, self.unroll_map) - - @property - def _key(self): - return (self.data, self._map, self._access) - - def __eq__(self, other): - r""":class:`Arg`\s compare equal of they are defined on the same data, - use the same :class:`Map` with the same index and the same access - descriptor.""" - return self._key == other._key - - def __ne__(self, other): - r""":class:`Arg`\s compare equal of they are defined on the same data, - use the same :class:`Map` with the same index and the same access - descriptor.""" - return not self.__eq__(other) - - def __str__(self): - return "OP2 Arg: dat %s, map %s, access %s" % \ - (self.data, self._map, self._access) - - def __repr__(self): - return "Arg(%r, %r, %r)" % \ - (self.data, self._map, self._access) - - def __iter__(self): - for arg in self.split: - yield arg - - @cached_property - def split(self): - """Split a mixed argument into a tuple of constituent arguments.""" - if self._is_mixed_dat: - return tuple(_make_object('Arg', d, m, self._access) - for d, m in zip(self.data, self._map)) - elif self._is_mixed_mat: - rows, cols = self.data.sparsity.shape - mr, mc = self.map - return 
tuple(_make_object('Arg', self.data[i, j], (mr.split[i], mc.split[j]), - self._access) - for i in range(rows) for j in range(cols)) - else: - return (self,) - - @cached_property - def name(self): - """The generated argument name.""" - return "arg%d" % self.position - - @cached_property - def ctype(self): - """String representing the C type of the data in this ``Arg``.""" - return self.data.ctype - - @cached_property - def dtype(self): - """Numpy datatype of this Arg""" - return self.data.dtype - - @cached_property - def map(self): - """The :class:`Map` via which the data is to be accessed.""" - return self._map - - @cached_property - def access(self): - """Access descriptor. One of the constants of type :class:`Access`""" - return self._access - - @cached_property - def _is_dat_view(self): - return isinstance(self.data, DatView) - - @cached_property - def _is_mat(self): - return isinstance(self.data, Mat) - - @cached_property - def _is_mixed_mat(self): - return self._is_mat and self.data.sparsity.shape > (1, 1) - - @cached_property - def _is_global(self): - return isinstance(self.data, Global) - - @cached_property - def _is_global_reduction(self): - return self._is_global and self._access in [INC, MIN, MAX] - - @cached_property - def _is_dat(self): - return isinstance(self.data, Dat) - - @cached_property - def _is_mixed_dat(self): - return isinstance(self.data, MixedDat) - - @cached_property - def _is_mixed(self): - return self._is_mixed_dat or self._is_mixed_mat - - @cached_property - def _is_direct(self): - return isinstance(self.data, Dat) and self.map is None - - @cached_property - def _is_indirect(self): - return isinstance(self.data, Dat) and self.map is not None - - @collective - def global_to_local_begin(self): - """Begin halo exchange for the argument if a halo update is required. - Doing halo exchanges only makes sense for :class:`Dat` objects. - """ - assert self._is_dat, "Doing halo exchanges only makes sense for Dats" - if self._is_direct: - return - if self.access is not WRITE: - self.data.global_to_local_begin(self.access) - - @collective - def global_to_local_end(self): - """Finish halo exchange for the argument if a halo update is required. - Doing halo exchanges only makes sense for :class:`Dat` objects. - """ - assert self._is_dat, "Doing halo exchanges only makes sense for Dats" - if self._is_direct: - return - if self.access is not WRITE: - self.data.global_to_local_end(self.access) - - @collective - def local_to_global_begin(self): - assert self._is_dat, "Doing halo exchanges only makes sense for Dats" - if self._is_direct: - return - if self.access in {INC, MIN, MAX}: - self.data.local_to_global_begin(self.access) - - @collective - def local_to_global_end(self): - assert self._is_dat, "Doing halo exchanges only makes sense for Dats" - if self._is_direct: - return - if self.access in {INC, MIN, MAX}: - self.data.local_to_global_end(self.access) - - @collective - def reduction_begin(self, comm): - """Begin reduction for the argument if its access is INC, MIN, or MAX. 
- Doing a reduction only makes sense for :class:`Global` objects."""
- assert self._is_global, \
- "Doing global reduction only makes sense for Globals"
- if self.access is not READ:
- if self.access is INC:
- op = MPI.SUM
- elif self.access is MIN:
- op = MPI.MIN
- elif self.access is MAX:
- op = MPI.MAX
- if MPI.VERSION >= 3:
- self._reduction_req = comm.Iallreduce(self.data._data, self.data._buf, op=op)
- else:
- comm.Allreduce(self.data._data, self.data._buf, op=op)
-
- @collective
- def reduction_end(self, comm):
- """End reduction for the argument if it is in flight.
- Doing a reduction only makes sense for :class:`Global` objects."""
- assert self._is_global, \
- "Doing global reduction only makes sense for Globals"
- if self.access is not READ:
- if MPI.VERSION >= 3:
- self._reduction_req.Wait()
- self._reduction_req = None
- self.data._data[:] = self.data._buf[:]
-
-
-class Set(object):
-
- """OP2 set.
-
- :param size: The size of the set.
- :type size: integer or list of three integers.
- :param string name: The name of the set (optional).
- :param halo: An existing halo to use (optional).
-
- When the set is employed as an iteration space in a
- :func:`pyop2.op2.par_loop`, the extent of any local iteration space within
- each set entry is indicated in brackets. See the example in
- :func:`pyop2.op2.par_loop` for more details.
-
- The size of the set can either be an integer, or a list of three
- integers. The latter case is used for running in parallel where
- we distinguish between:
-
- - `CORE` (owned and not touching halo)
- - `OWNED` (owned, touching halo)
- - `GHOST` (not owned, but read when executing over the halo)
-
- If a single integer is passed, we assume that we're running in
- serial and there is no distinction.
-
- The division of set elements is: ::
-
- [0, CORE)
- [CORE, OWNED)
- [OWNED, GHOST)
-
- Halo send/receive data is stored on sets in a :class:`Halo`.
- """
-
- _CORE_SIZE = 0
- _OWNED_SIZE = 1
- _GHOST_SIZE = 2
-
- _extruded = False
-
- _kernel_args_ = ()
- _argtypes_ = ()
-
- @cached_property
- def _wrapper_cache_key_(self):
- return (type(self), )
-
- @validate_type(('size', (numbers.Integral, tuple, list, np.ndarray), SizeTypeError),
- ('name', str, NameTypeError))
- def __init__(self, size, name=None, halo=None, comm=None):
- self.comm = dup_comm(comm)
- if isinstance(size, numbers.Integral):
- size = [size] * 3
- size = as_tuple(size, numbers.Integral, 3)
- assert size[Set._CORE_SIZE] <= size[Set._OWNED_SIZE] <= \
- size[Set._GHOST_SIZE], "Set received invalid sizes: %s" % size
- self._sizes = size
- self._name = name or "set_#x%x" % id(self)
- self._halo = halo
- self._partition_size = 1024
- # A cache of objects built on top of this set
- self._cache = {}
-
- @cached_property
- def core_size(self):
- """Core set size. Owned elements not touching halo elements."""
- return self._sizes[Set._CORE_SIZE]
-
- @cached_property
- def size(self):
- """Set size, owned elements."""
- return self._sizes[Set._OWNED_SIZE]
-
- @cached_property
- def total_size(self):
- """Set size including ghost elements.
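As an illustration (sizes hypothetical), for ``s = Set([3, 5, 7])``
on one process ::

    s.core_size == 3    # entries [0, 3) need no halo data to compute on
    s.size == 5         # entries [0, 5) are owned by this process
    s.total_size == 7   # entries [5, 7) are ghost copies of remote entries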
- """ - return self._sizes[Set._GHOST_SIZE] - - @cached_property - def sizes(self): - """Set sizes: core, owned, execute halo, total.""" - return self._sizes - - @cached_property - def core_part(self): - return SetPartition(self, 0, self.core_size) - - @cached_property - def owned_part(self): - return SetPartition(self, self.core_size, self.size - self.core_size) - - @cached_property - def name(self): - """User-defined label""" - return self._name - - @cached_property - def halo(self): - """:class:`Halo` associated with this Set""" - return self._halo - - @property - def partition_size(self): - """Default partition size""" - return self._partition_size - - @partition_size.setter - def partition_size(self, partition_value): - """Set the partition size""" - self._partition_size = partition_value - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __getitem__(self, idx): - """Allow indexing to return self""" - assert idx == 0 - return self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - def __str__(self): - return "OP2 Set: %s with size %s" % (self._name, self.size) - - def __repr__(self): - return "Set(%r, %r)" % (self._sizes, self._name) - - def __call__(self, *indices): - """Build a :class:`Subset` from this :class:`Set` - - :arg indices: The elements of this :class:`Set` from which the - :class:`Subset` should be formed. - - """ - if len(indices) == 1: - indices = indices[0] - if np.isscalar(indices): - indices = [indices] - return _make_object('Subset', self, indices) - - def __contains__(self, dset): - """Indicate whether a given DataSet is compatible with this Set.""" - if isinstance(dset, DataSet): - return dset.set is self - else: - return False - - def __pow__(self, e): - """Derive a :class:`DataSet` with dimension ``e``""" - return _make_object('DataSet', self, dim=e) - - @cached_property - def layers(self): - """Return None (not an :class:`ExtrudedSet`).""" - return None - - def _check_operands(self, other): - if type(other) is Set: - if other is not self: - raise ValueError("Uable to perform set operations between two unrelated sets: %s and %s." % (self, other)) - elif type(other) is Subset: - if self is not other._superset: - raise TypeError("Superset mismatch: self (%s) != other._superset (%s)" % (self, other._superset)) - else: - raise TypeError("Unable to perform set operations between `Set` and %s." 
% (type(other), ))
-
- def intersection(self, other):
- self._check_operands(other)
- return other
-
- def union(self, other):
- self._check_operands(other)
- return self
-
- def difference(self, other):
- self._check_operands(other)
- if other is self:
- return Subset(self, [])
- else:
- return type(other)(self, np.setdiff1d(np.asarray(range(self.total_size), dtype=IntType), other._indices))
-
- def symmetric_difference(self, other):
- self._check_operands(other)
- return self.difference(other)
-
-
-class GlobalSet(Set):
-
- """A proxy set allowing a :class:`Global` to be used in place of a
- :class:`Dat` where appropriate."""
-
- _extruded = False
-
- _kernel_args_ = ()
- _argtypes_ = ()
-
- def __init__(self, comm=None):
- self.comm = dup_comm(comm)
- self._cache = {}
-
- @cached_property
- def core_size(self):
- return 0
-
- @cached_property
- def size(self):
- return 1 if self.comm.rank == 0 else 0
-
- @cached_property
- def total_size(self):
- """Total set size, including halo elements."""
- return 1 if self.comm.rank == 0 else 0
-
- @cached_property
- def sizes(self):
- """Set sizes: core, owned, total."""
- return (self.core_size, self.size, self.total_size)
-
- @cached_property
- def name(self):
- """User-defined label"""
- return "GlobalSet"
-
- @cached_property
- def halo(self):
- """:class:`Halo` associated with this Set"""
- return None
-
- @property
- def partition_size(self):
- """Default partition size"""
- return None
-
- def __iter__(self):
- """Yield self when iterated over."""
- yield self
-
- def __getitem__(self, idx):
- """Allow indexing to return self"""
- assert idx == 0
- return self
-
- def __len__(self):
- """This is not a mixed type and therefore of length 1."""
- return 1
-
- def __str__(self):
- return "OP2 GlobalSet"
-
- def __repr__(self):
- return "GlobalSet()"
-
- def __eq__(self, other):
- # Currently all GlobalSets compare equal.
- return isinstance(other, GlobalSet)
-
- def __hash__(self):
- # Currently all GlobalSets compare equal.
- return hash(type(self))
-
-
-class ExtrudedSet(Set):
-
- """OP2 ExtrudedSet.
-
- :param parent: The parent :class:`Set` to build this :class:`ExtrudedSet` on top of
- :type parent: a :class:`Set`.
- :param layers: The number of layers in this :class:`ExtrudedSet`.
- :type layers: an integer, indicating the number of layers for every entity,
- or an array of shape (parent.total_size, 2) giving the start
- and one past the stop layer for every entity. An entry
- ``a, b = layers[e, ...]`` means that the layers for entity
- ``e`` run over :math:`[a, b)`.
-
- The number of layers indicates the number of times the base set is
- extruded in the direction of the :class:`ExtrudedSet`. As a
- result, there are ``layers-1`` extruded "cells" in an extruded set.
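For example, using the variable-layer form for a ``parent`` set with
``total_size == 2`` (values hypothetical) ::

    layers = np.asarray([[0, 4], [1, 3]], dtype=IntType)
    xs = ExtrudedSet(parent, layers)
    # entity 0 spans layers [0, 4), giving 3 extruded cells;
    # entity 1 spans layers [1, 3), giving 2 extruded cells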
- """ - - @validate_type(('parent', Set, TypeError)) - def __init__(self, parent, layers): - self._parent = parent - try: - layers = verify_reshape(layers, IntType, (parent.total_size, 2)) - self.constant_layers = False - if layers.min() < 0: - raise SizeTypeError("Bottom of layers must be >= 0") - if any(layers[:, 1] - layers[:, 0] < 1): - raise SizeTypeError("Number of layers must be >= 0") - except DataValueError: - # Legacy, integer - layers = np.asarray(layers, dtype=IntType) - if layers.shape: - raise SizeTypeError("Specifying layers per entity, but provided %s, needed (%d, 2)", - layers.shape, parent.total_size) - if layers < 2: - raise SizeTypeError("Need at least two layers, not %d", layers) - layers = np.asarray([[0, layers]], dtype=IntType) - self.constant_layers = True - - self._layers = layers - self._extruded = True - - @cached_property - def _kernel_args_(self): - return (self.layers_array.ctypes.data, ) - - @cached_property - def _argtypes_(self): - return (ctypes.c_voidp, ) - - @cached_property - def _wrapper_cache_key_(self): - return self.parent._wrapper_cache_key_ + (self.constant_layers, ) - - def __getattr__(self, name): - """Returns a :class:`Set` specific attribute.""" - value = getattr(self._parent, name) - setattr(self, name, value) - return value - - def __contains__(self, set): - return set is self.parent - - def __str__(self): - return "OP2 ExtrudedSet: %s with size %s (%s layers)" % \ - (self._name, self.size, self._layers) - - def __repr__(self): - return "ExtrudedSet(%r, %r)" % (self._parent, self._layers) - - @cached_property - def parent(self): - return self._parent - - @cached_property - def layers(self): - """The layers of this extruded set.""" - if self.constant_layers: - # Backwards compat - return self.layers_array[0, 1] - else: - raise ValueError("No single layer, use layers_array attribute") - - @cached_property - def layers_array(self): - return self._layers - - -class Subset(ExtrudedSet): - - """OP2 subset. - - :param superset: The superset of the subset. - :type superset: a :class:`Set` or a :class:`Subset`. - :param indices: Elements of the superset that form the - subset. Duplicate values are removed when constructing the subset. - :type indices: a list of integers, or a numpy array. 
- """ - @validate_type(('superset', Set, TypeError), - ('indices', (list, tuple, np.ndarray), TypeError)) - def __init__(self, superset, indices): - # sort and remove duplicates - indices = np.unique(indices) - if isinstance(superset, Subset): - # Unroll indices to point to those in the parent - indices = superset.indices[indices] - superset = superset.superset - assert type(superset) is Set or type(superset) is ExtrudedSet, \ - 'Subset construction failed, should not happen' - - self._superset = superset - self._indices = verify_reshape(indices, IntType, (len(indices),)) - - if len(self._indices) > 0 and (self._indices[0] < 0 or self._indices[-1] >= self._superset.total_size): - raise SubsetIndexOutOfBounds( - 'Out of bounds indices in Subset construction: [%d, %d) not [0, %d)' % - (self._indices[0], self._indices[-1], self._superset.total_size)) - - self._sizes = ((self._indices < superset.core_size).sum(), - (self._indices < superset.size).sum(), - len(self._indices)) - self._extruded = superset._extruded - - @cached_property - def _kernel_args_(self): - return self._superset._kernel_args_ + (self._indices.ctypes.data, ) - - @cached_property - def _argtypes_(self): - return self._superset._argtypes_ + (ctypes.c_voidp, ) - - # Look up any unspecified attributes on the _set. - def __getattr__(self, name): - """Returns a :class:`Set` specific attribute.""" - value = getattr(self._superset, name) - setattr(self, name, value) - return value - - def __pow__(self, e): - """Derive a :class:`DataSet` with dimension ``e``""" - raise NotImplementedError("Deriving a DataSet from a Subset is unsupported") - - def __str__(self): - return "OP2 Subset: %s with sizes %s" % \ - (self._name, self._sizes) - - def __repr__(self): - return "Subset(%r, %r)" % (self._superset, self._indices) - - def __call__(self, *indices): - """Build a :class:`Subset` from this :class:`Subset` - - :arg indices: The elements of this :class:`Subset` from which the - :class:`Subset` should be formed. - - """ - if len(indices) == 1: - indices = indices[0] - if np.isscalar(indices): - indices = [indices] - return _make_object('Subset', self, indices) - - @cached_property - def superset(self): - """Returns the superset Set""" - return self._superset - - @cached_property - def indices(self): - """Returns the indices pointing in the superset.""" - return self._indices - - @cached_property - def owned_indices(self): - """Return the indices that correspond to the owned entities of the - superset. - """ - return self.indices[self.indices < self.superset.size] - - @cached_property - def layers_array(self): - if self._superset.constant_layers: - return self._superset.layers_array - else: - return self._superset.layers_array[self.indices, ...] - - def _check_operands(self, other): - if type(other) is Set: - if other is not self._superset: - raise TypeError("Superset mismatch: self._superset (%s) != other (%s)" % (self._superset, other)) - elif type(other) is Subset: - if self._superset is not other._superset: - raise TypeError("Unable to perform set operation between subsets of mismatching supersets (%s != %s)" % (self._superset, other._superset)) - else: - raise TypeError("Unable to perform set operations between `Subset` and %s." 
% (type(other), )) - - def intersection(self, other): - self._check_operands(other) - if other is self._superset: - return self - else: - return type(self)(self._superset, np.intersect1d(self._indices, other._indices)) - - def union(self, other): - self._check_operands(other) - if other is self._superset: - return other - else: - return type(self)(self._superset, np.union1d(self._indices, other._indices)) - - def difference(self, other): - self._check_operands(other) - if other is self._superset: - return Subset(other, []) - else: - return type(self)(self._superset, np.setdiff1d(self._indices, other._indices)) - - def symmetric_difference(self, other): - self._check_operands(other) - if other is self._superset: - return other.symmetric_difference(self) - else: - return type(self)(self._superset, np.setxor1d(self._indices, other._indices)) - - -class SetPartition(object): - def __init__(self, set, offset, size): - self.set = set - self.offset = offset - self.size = size - - -class MixedSet(Set, ObjectCached): - r"""A container for a bag of :class:`Set`\s.""" - - def __init__(self, sets): - r""":param iterable sets: Iterable of :class:`Set`\s or :class:`ExtrudedSet`\s""" - if self._initialized: - return - self._sets = sets - assert all(s is None or isinstance(s, GlobalSet) or ((s.layers == self._sets[0].layers).all() if s.layers is not None else True) for s in sets), \ - "All components of a MixedSet must have the same number of layers." - # TODO: do all sets need the same communicator? - self.comm = reduce(lambda a, b: a or b, map(lambda s: s if s is None else s.comm, sets)) - self._initialized = True - - @cached_property - def _kernel_args_(self): - raise NotImplementedError - - @cached_property - def _argtypes_(self): - raise NotImplementedError - - @cached_property - def _wrapper_cache_key_(self): - raise NotImplementedError - - @classmethod - def _process_args(cls, sets, **kwargs): - sets = [s for s in sets] - try: - sets = as_tuple(sets, ExtrudedSet) - except TypeError: - sets = as_tuple(sets, (Set, type(None))) - cache = sets[0] - return (cache, ) + (sets, ), kwargs - - @classmethod - def _cache_key(cls, sets, **kwargs): - return sets - - def __getitem__(self, idx): - """Return :class:`Set` with index ``idx`` or a given slice of sets.""" - return self._sets[idx] - - @cached_property - def split(self): - r"""The underlying tuple of :class:`Set`\s.""" - return self._sets - - @cached_property - def core_size(self): - """Core set size. 
Owned elements not touching halo elements.""" - return sum(s.core_size for s in self._sets) - - @cached_property - def size(self): - """Set size, owned elements.""" - return sum(0 if s is None else s.size for s in self._sets) - - @cached_property - def total_size(self): - """Total set size, including halo elements.""" - return sum(s.total_size for s in self._sets) - - @cached_property - def sizes(self): - """Set sizes: core, owned, execute halo, total.""" - return (self.core_size, self.size, self.total_size) - - @cached_property - def name(self): - """User-defined labels.""" - return tuple(s.name for s in self._sets) - - @cached_property - def halo(self): - r""":class:`Halo`\s associated with these :class:`Set`\s.""" - halos = tuple(s.halo for s in self._sets) - return halos if any(halos) else None - - @cached_property - def _extruded(self): - return isinstance(self._sets[0], ExtrudedSet) - - @cached_property - def layers(self): - """Numbers of layers in the extruded mesh (or None if this MixedSet is not extruded).""" - return self._sets[0].layers - - def __iter__(self): - r"""Yield all :class:`Set`\s when iterated over.""" - for s in self._sets: - yield s - - def __len__(self): - """Return number of contained :class:`Set`s.""" - return len(self._sets) - - def __pow__(self, e): - """Derive a :class:`MixedDataSet` with dimensions ``e``""" - return _make_object('MixedDataSet', self._sets, e) - - def __str__(self): - return "OP2 MixedSet composed of Sets: %s" % (self._sets,) - - def __repr__(self): - return "MixedSet(%r)" % (self._sets,) - - def __eq__(self, other): - return type(self) == type(other) and self._sets == other._sets - - -class DataSet(ObjectCached): - """PyOP2 Data Set - - Set used in the op2.Dat structures to specify the dimension of the data. - """ - - @validate_type(('iter_set', Set, SetTypeError), - ('dim', (numbers.Integral, tuple, list), DimTypeError), - ('name', str, NameTypeError)) - def __init__(self, iter_set, dim=1, name=None): - if isinstance(iter_set, ExtrudedSet): - raise NotImplementedError("Not allowed!") - if self._initialized: - return - if isinstance(iter_set, Subset): - raise NotImplementedError("Deriving a DataSet from a Subset is unsupported") - self._set = iter_set - self._dim = as_tuple(dim, numbers.Integral) - self._cdim = np.prod(self._dim).item() - self._name = name or "dset_#x%x" % id(self) - self._initialized = True - - @classmethod - def _process_args(cls, *args, **kwargs): - return (args[0], ) + args, kwargs - - @classmethod - def _cache_key(cls, iter_set, dim=1, name=None): - return (iter_set, as_tuple(dim, numbers.Integral)) - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.dim, self._set._wrapper_cache_key_) - - def __getstate__(self): - """Extract state to pickle.""" - return self.__dict__ - - def __setstate__(self, d): - """Restore from pickled state.""" - self.__dict__.update(d) - - # Look up any unspecified attributes on the _set. - def __getattr__(self, name): - """Returns a Set specific attribute.""" - value = getattr(self.set, name) - setattr(self, name, value) - return value - - def __getitem__(self, idx): - """Allow index to return self""" - assert idx == 0 - return self - - @cached_property - def dim(self): - """The shape tuple of the values for each element of the set.""" - return self._dim - - @cached_property - def cdim(self): - """The scalar number of values for each member of the set. 
This is - the product of the dim tuple.""" - return self._cdim - - @cached_property - def name(self): - """Returns the name of the data set.""" - return self._name - - @cached_property - def set(self): - """Returns the parent set of the data set.""" - return self._set - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - def __str__(self): - return "OP2 DataSet: %s on set %s, with dim %s" % \ - (self._name, self._set, self._dim) - - def __repr__(self): - return "DataSet(%r, %r, %r)" % (self._set, self._dim, self._name) - - def __contains__(self, dat): - """Indicate whether a given Dat is compatible with this DataSet.""" - return dat.dataset == self - - -class GlobalDataSet(DataSet): - """A proxy :class:`DataSet` for use in a :class:`Sparsity` where the - matrix has :class:`Global` rows or columns.""" - - def __init__(self, global_): - """ - :param global_: The :class:`Global` on which this object is based.""" - - self._global = global_ - self._globalset = GlobalSet(comm=self.comm) - self._name = "gdset_#x%x" % id(self) - - @classmethod - def _cache_key(cls, *args): - return None - - @cached_property - def dim(self): - """The shape tuple of the values for each element of the set.""" - return self._global._dim - - @cached_property - def cdim(self): - """The scalar number of values for each member of the set. This is - the product of the dim tuple.""" - return self._global._cdim - - @cached_property - def name(self): - """Returns the name of the data set.""" - return self._global._name - - @cached_property - def comm(self): - """Return the communicator on which the set is defined.""" - return self._global.comm - - @cached_property - def set(self): - """Returns the parent set of the data set.""" - return self._globalset - - @cached_property - def size(self): - """The number of local entries in the Dataset (1 on rank 0)""" - return 1 if MPI.comm.rank == 0 else 0 - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - def __str__(self): - return "OP2 GlobalDataSet: %s on Global %s" % \ - (self._name, self._global) - - def __repr__(self): - return "GlobalDataSet(%r)" % (self._global) - - -class MixedDataSet(DataSet, ObjectCached): - r"""A container for a bag of :class:`DataSet`\s. 
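Once built, a :class:`MixedDataSet` behaves like a tuple of its
component :class:`DataSet`\s, e.g. (sketch) ::

    mdset = op2.MixedDataSet([set1 ** 2, set2 ** 1])
    mdset.dim     # ((2,), (1,))
    mdset.cdim    # 3

The supported construction forms are as follows.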
-
- Initialized either from a :class:`MixedSet` and an iterable or iterator of
- ``dims`` of corresponding length ::
-
- mdset = op2.MixedDataSet(mset, [dim1, ..., dimN])
-
- or from a tuple of :class:`Set`\s and an iterable of ``dims`` of
- corresponding length ::
-
- mdset = op2.MixedDataSet([set1, ..., setN], [dim1, ..., dimN])
-
- If all ``dims`` are to be the same, they can also be given as an
- :class:`int` for either of the above invocations ::
-
- mdset = op2.MixedDataSet(mset, dim)
- mdset = op2.MixedDataSet([set1, ..., setN], dim)
-
- If initialized from a :class:`MixedSet` without explicitly specifying
- ``dims``, they default to 1 ::
-
- mdset = op2.MixedDataSet(mset)
-
- Initialized from an iterable or iterator of :class:`DataSet`\s and/or
- :class:`Set`\s, where :class:`Set`\s are implicitly upcast to
- :class:`DataSet`\s of dim 1 ::
-
- mdset = op2.MixedDataSet([dset1, ..., dsetN])
- """
-
- def __init__(self, arg, dims=None):
- r"""
- :param arg: a :class:`MixedSet` or an iterable or a generator
- expression of :class:`Set`\s or :class:`DataSet`\s or a
- mixture of both
- :param dims: `None` (the default) or an :class:`int` or an iterable or
- generator expression of :class:`int`\s, which **must** be
- of the same length as ``arg``
-
- .. warning::
- When using generator expressions for ``arg`` or ``dims``, these
- **must** terminate or else they will cause an infinite loop.
- """
- if self._initialized:
- return
- self._dsets = arg
- self._initialized = True
-
- @classmethod
- def _process_args(cls, arg, dims=None):
- # If the second argument is not None it is expected to be a scalar dim
- # or an iterable of dims and the first is expected to be a MixedSet or
- # an iterable of Sets
- if dims is not None:
- # If arg is a MixedSet, get its Sets tuple
- sets = arg.split if isinstance(arg, MixedSet) else tuple(arg)
- # If dims is a scalar, turn it into a tuple of right length
- dims = (dims,) * len(sets) if isinstance(dims, int) else tuple(dims)
- if len(sets) != len(dims):
- raise ValueError("Got MixedSet of %d Sets but %d dims" %
- (len(sets), len(dims)))
- dsets = tuple(s ** d for s, d in zip(sets, dims))
- # Otherwise expect the first argument to be an iterable of Sets and/or
- # DataSets and upcast Sets to DataSets as necessary
- else:
- arg = [s if isinstance(s, DataSet) else s ** 1 for s in arg]
- dsets = as_tuple(arg, type=DataSet)
-
- return (dsets[0].set, ) + (dsets, ), {}
-
- @classmethod
- def _cache_key(cls, arg, dims=None):
- return arg
-
- @cached_property
- def _wrapper_cache_key_(self):
- raise NotImplementedError
-
- def __getitem__(self, idx):
- """Return :class:`DataSet` with index ``idx`` or a given slice of datasets."""
- return self._dsets[idx]
-
- @cached_property
- def split(self):
- r"""The underlying tuple of :class:`DataSet`\s."""
- return self._dsets
-
- @cached_property
- def dim(self):
- """The shape tuple of the values for each element of the sets."""
- return tuple(s.dim for s in self._dsets)
-
- @cached_property
- def cdim(self):
- """The sum of the scalar number of values for each member of the sets.
- This is the sum of products of the dim tuples.""" - return sum(s.cdim for s in self._dsets) - - @cached_property - def name(self): - """Returns the name of the data sets.""" - return tuple(s.name for s in self._dsets) - - @cached_property - def set(self): - """Returns the :class:`MixedSet` this :class:`MixedDataSet` is - defined on.""" - return MixedSet(s.set for s in self._dsets) - - def __iter__(self): - r"""Yield all :class:`DataSet`\s when iterated over.""" - for ds in self._dsets: - yield ds - - def __len__(self): - """Return number of contained :class:`DataSet`s.""" - return len(self._dsets) - - def __str__(self): - return "OP2 MixedDataSet composed of DataSets: %s" % (self._dsets,) - - def __repr__(self): - return "MixedDataSet(%r)" % (self._dsets,) - - -class Halo(object, metaclass=abc.ABCMeta): - - """A description of a halo associated with a :class:`Set`. - - The halo object describes which :class:`Set` elements are sent - where, and which :class:`Set` elements are received from where. - """ - - @abc.abstractproperty - def comm(self): - """The MPI communicator for this halo.""" - pass - - @abc.abstractproperty - def local_to_global_numbering(self): - """The mapping from process-local to process-global numbers for this halo.""" - pass - - @abc.abstractmethod - def global_to_local_begin(self, dat, insert_mode): - """Begin an exchange from global (assembled) to local (ghosted) representation. - - :arg dat: The :class:`Dat` to exchange. - :arg insert_mode: The insertion mode. - """ - pass - - @abc.abstractmethod - def global_to_local_end(self, dat, insert_mode): - """Finish an exchange from global (assembled) to local (ghosted) representation. - - :arg dat: The :class:`Dat` to exchange. - :arg insert_mode: The insertion mode. - """ - pass - - @abc.abstractmethod - def local_to_global_begin(self, dat, insert_mode): - """Begin an exchange from local (ghosted) to global (assembled) representation. - - :arg dat: The :class:`Dat` to exchange. - :arg insert_mode: The insertion mode. - """ - pass - - @abc.abstractmethod - def local_to_global_end(self, dat, insert_mode): - """Finish an exchange from local (ghosted) to global (assembled) representation. - - :arg dat: The :class:`Dat` to exchange. - :arg insert_mode: The insertion mode. - """ - pass - - -class DataCarrier(object): - - """Abstract base class for OP2 data. - - Actual objects will be :class:`DataCarrier` objects of rank 0 - (:class:`Global`), rank 1 (:class:`Dat`), or rank 2 - (:class:`Mat`)""" - - @cached_property - def dtype(self): - """The Python type of the data.""" - return self._data.dtype - - @cached_property - def ctype(self): - """The c type of the data.""" - return as_cstr(self.dtype) - - @cached_property - def name(self): - """User-defined label.""" - return self._name - - @cached_property - def dim(self): - """The shape tuple of the values for each element of the object.""" - return self._dim - - @cached_property - def cdim(self): - """The scalar number of values for each member of the object. This is - the product of the dim tuple.""" - return self._cdim - - -class _EmptyDataMixin(object): - """A mixin for :class:`Dat` and :class:`Global` objects that takes - care of allocating data on demand if the user has passed nothing - in. - - Accessing the :attr:`_data` property allocates a zeroed data array - if it does not already exist. 
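For example (sketch), nothing is allocated until the data is first
touched ::

    d = Dat(dset)         # data=None, so no buffer exists yet
    d._is_allocated       # False
    d.data_ro             # first access allocates a zeroed array
    d._is_allocated       # True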
- """ - def __init__(self, data, dtype, shape): - if data is None: - self._dtype = np.dtype(dtype if dtype is not None else ScalarType) - else: - self._numpy_data = verify_reshape(data, dtype, shape, allow_none=True) - self._dtype = self._data.dtype - - @cached_property - def _data(self): - """Return the user-provided data buffer, or a zeroed buffer of - the correct size if none was provided.""" - if not self._is_allocated: - self._numpy_data = np.zeros(self.shape, dtype=self._dtype) - return self._numpy_data - - @property - def _is_allocated(self): - """Return True if the data buffer has been allocated.""" - return hasattr(self, '_numpy_data') - - -class Dat(DataCarrier, _EmptyDataMixin): - """OP2 vector data. A :class:`Dat` holds values on every element of a - :class:`DataSet`. - - If a :class:`Set` is passed as the ``dataset`` argument, rather - than a :class:`DataSet`, the :class:`Dat` is created with a default - :class:`DataSet` dimension of 1. - - If a :class:`Dat` is passed as the ``dataset`` argument, a copy is - returned. - - It is permissible to pass `None` as the `data` argument. In this - case, allocation of the data buffer is postponed until it is - accessed. - - .. note:: - If the data buffer is not passed in, it is implicitly - initialised to be zero. - - When a :class:`Dat` is passed to :func:`pyop2.op2.par_loop`, the map via - which indirection occurs and the access descriptor are passed by - calling the :class:`Dat`. For instance, if a :class:`Dat` named ``D`` is - to be accessed for reading via a :class:`Map` named ``M``, this is - accomplished by :: - - D(pyop2.READ, M) - - The :class:`Map` through which indirection occurs can be indexed - using the index notation described in the documentation for the - :class:`Map`. Direct access to a Dat is accomplished by - omitting the path argument. - - :class:`Dat` objects support the pointwise linear algebra operations - ``+=``, ``*=``, ``-=``, ``/=``, where ``*=`` and ``/=`` also support - multiplication / division by a scalar. - """ - - _zero_kernels = {} - """Class-level cache for zero kernels.""" - - _modes = [READ, WRITE, RW, INC, MIN, MAX] - - @cached_property - def pack(self): - from pyop2.codegen.builder import DatPack - return DatPack - - @validate_type(('dataset', (DataCarrier, DataSet, Set), DataSetTypeError), - ('name', str, NameTypeError)) - @validate_dtype(('dtype', None, DataTypeError)) - def __init__(self, dataset, data=None, dtype=None, name=None): - - if isinstance(dataset, Dat): - self.__init__(dataset.dataset, None, dtype=dataset.dtype, - name="copy_of_%s" % dataset.name) - dataset.copy(self) - return - if type(dataset) is Set or type(dataset) is ExtrudedSet: - # If a Set, rather than a dataset is passed in, default to - # a dataset dimension of 1. 
- dataset = dataset ** 1 - self._shape = (dataset.total_size,) + (() if dataset.cdim == 1 else dataset.dim) - _EmptyDataMixin.__init__(self, data, dtype, self._shape) - - self._dataset = dataset - self.comm = dataset.comm - self.halo_valid = True - self._name = name or "dat_#x%x" % id(self) - - @cached_property - def _kernel_args_(self): - return (self._data.ctypes.data, ) - - @cached_property - def _argtypes_(self): - return (ctypes.c_voidp, ) - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.dtype, self._dataset._wrapper_cache_key_) - - @validate_in(('access', _modes, ModeValueError)) - def __call__(self, access, path=None): - if configuration["type_check"] and path and path.toset != self.dataset.set: - raise MapValueError("To Set of Map does not match Set of Dat.") - return _make_object('Arg', data=self, map=path, access=access) - - def __getitem__(self, idx): - """Return self if ``idx`` is 0, raise an error otherwise.""" - if idx != 0: - raise IndexValueError("Can only extract component 0 from %r" % self) - return self - - @cached_property - def split(self): - """Tuple containing only this :class:`Dat`.""" - return (self,) - - @cached_property - def dataset(self): - """:class:`DataSet` on which the Dat is defined.""" - return self._dataset - - @cached_property - def dim(self): - """The shape of the values for each element of the object.""" - return self.dataset.dim - - @cached_property - def cdim(self): - """The scalar number of values for each member of the object. This is - the product of the dim tuple.""" - return self.dataset.cdim - - @property - @collective - def data(self): - """Numpy array containing the data values. - - With this accessor you are claiming that you will modify - the values you get back. If you only need to look at the - values, use :meth:`data_ro` instead. - - This only shows local values, to see the halo values too use - :meth:`data_with_halos`. - - """ - if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: - raise RuntimeError("Illegal access: no data associated with this Dat!") - self.halo_valid = False - v = self._data[:self.dataset.size].view() - v.setflags(write=True) - return v - - @property - @collective - def data_with_halos(self): - r"""A view of this :class:`Dat`\s data. - - This accessor marks the :class:`Dat` as dirty, see - :meth:`data` for more details on the semantics. - - With this accessor, you get to see up to date halo values, but - you should not try and modify them, because they will be - overwritten by the next halo exchange.""" - self.global_to_local_begin(RW) - self.global_to_local_end(RW) - self.halo_valid = False - v = self._data.view() - v.setflags(write=True) - return v - - @property - @collective - def data_ro(self): - """Numpy array containing the data values. Read-only. - - With this accessor you are not allowed to modify the values - you get back. If you need to do so, use :meth:`data` instead. - - This only shows local values, to see the halo values too use - :meth:`data_ro_with_halos`. - - """ - if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0: - raise RuntimeError("Illegal access: no data associated with this Dat!") - v = self._data[:self.dataset.size].view() - v.setflags(write=False) - return v - - @property - @collective - def data_ro_with_halos(self): - r"""A view of this :class:`Dat`\s data. - - This accessor does not mark the :class:`Dat` as dirty, and is - a read only view, see :meth:`data_ro` for more details on the - semantics. 
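As a rough summary of the four accessors ::

    dat.data                  # writeable, owned entries only; marks halo dirty
    dat.data_ro               # read-only, owned entries only
    dat.data_with_halos       # writeable, includes halo; marks halo dirty
    dat.data_ro_with_halos    # read-only, includes an up-to-date halo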
- - With this accessor, you get to see up to date halo values, but - you should not try and modify them, because they will be - overwritten by the next halo exchange. - - """ - self.global_to_local_begin(READ) - self.global_to_local_end(READ) - v = self._data.view() - v.setflags(write=False) - return v - - def save(self, filename): - """Write the data array to file ``filename`` in NumPy format.""" - np.save(filename, self.data_ro) - - def load(self, filename): - """Read the data stored in file ``filename`` into a NumPy array - and store the values in :meth:`_data`. - """ - # The np.save method appends a .npy extension to the file name - # if the user has not supplied it. However, np.load does not, - # so we need to handle this ourselves here. - if(filename[-4:] != ".npy"): - filename = filename + ".npy" - - if isinstance(self.data, tuple): - # MixedDat case - for d, d_from_file in zip(self.data, np.load(filename)): - d[:] = d_from_file[:] - else: - self.data[:] = np.load(filename) - - @cached_property - def shape(self): - return self._shape - - @cached_property - def dtype(self): - return self._dtype - - @cached_property - def nbytes(self): - """Return an estimate of the size of the data associated with this - :class:`Dat` in bytes. This will be the correct size of the data - payload, but does not take into account the (presumably small) - overhead of the object and its metadata. - - Note that this is the process local memory usage, not the sum - over all MPI processes. - """ - - return self.dtype.itemsize * self.dataset.total_size * self.dataset.cdim - - @collective - def zero(self, subset=None): - """Zero the data associated with this :class:`Dat` - - :arg subset: A :class:`Subset` of entries to zero (optional).""" - # If there is no subset we can safely zero the halo values. - if subset is None: - self._data[:] = 0 - self.halo_valid = True - elif subset.superset != self.dataset.set: - raise MapValueError("The subset and dataset are incompatible") - else: - self.data[subset.owned_indices] = 0 - - @collective - def copy(self, other, subset=None): - """Copy the data in this :class:`Dat` into another. - - :arg other: The destination :class:`Dat` - :arg subset: A :class:`Subset` of elements to copy (optional)""" - if other is self: - return - if subset is None: - # If the current halo is valid we can also copy these values across. 
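        # Copying the raw buffer wholesale also copies ghost entries,
        # so the destination's halo is immediately valid and no halo
        # exchange is needed; otherwise only owned entries are copied
        # below and the destination's halo is left marked as dirty.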
- if self.halo_valid: - other._data[:] = self._data - other.halo_valid = True - else: - other.data[:] = self.data_ro - elif subset.superset != self.dataset.set: - raise MapValueError("The subset and dataset are incompatible") - else: - other.data[subset.owned_indices] = self.data_ro[subset.owned_indices] - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - def __str__(self): - return "OP2 Dat: %s on (%s) with datatype %s" \ - % (self._name, self._dataset, self.dtype.name) - - def __repr__(self): - return "Dat(%r, None, %r, %r)" \ - % (self._dataset, self.dtype, self._name) - - def _check_shape(self, other): - if other.dataset.dim != self.dataset.dim: - raise ValueError('Mismatched shapes in operands %s and %s', - self.dataset.dim, other.dataset.dim) - - def _op_kernel(self, op, globalp, dtype): - key = (op, globalp, dtype) - try: - if not hasattr(self, "_op_kernel_cache"): - self._op_kernel_cache = {} - return self._op_kernel_cache[key] - except KeyError: - pass - import islpy as isl - import pymbolic.primitives as p - name = "binop_%s" % op.__name__ - inames = isl.make_zero_and_vars(["i"]) - domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) - _other = p.Variable("other") - _self = p.Variable("self") - _ret = p.Variable("ret") - i = p.Variable("i") - lhs = _ret.index(i) - if globalp: - rhs = _other.index(0) - rshape = (1, ) - else: - rhs = _other.index(i) - rshape = (self.cdim, ) - insn = loopy.Assignment(lhs, op(_self.index(i), rhs), within_inames=frozenset(["i"])) - data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)), - loopy.GlobalArg("other", dtype=dtype, shape=rshape), - loopy.GlobalArg("ret", dtype=self.dtype, shape=(self.cdim,))] - knl = loopy.make_function([domain], [insn], data, name=name, target=loopy.CTarget(), lang_version=(2018, 2)) - return self._op_kernel_cache.setdefault(key, _make_object('Kernel', knl, name)) - - def _op(self, other, op): - ret = _make_object('Dat', self.dataset, None, self.dtype) - if np.isscalar(other): - other = _make_object('Global', 1, data=other) - globalp = True - else: - self._check_shape(other) - globalp = False - par_loop(self._op_kernel(op, globalp, other.dtype), - self.dataset.set, self(READ), other(READ), ret(WRITE)) - return ret - - def _iop_kernel(self, op, globalp, other_is_self, dtype): - key = (op, globalp, other_is_self, dtype) - try: - if not hasattr(self, "_iop_kernel_cache"): - self._iop_kernel_cache = {} - return self._iop_kernel_cache[key] - except KeyError: - pass - import islpy as isl - import pymbolic.primitives as p - name = "iop_%s" % op.__name__ - inames = isl.make_zero_and_vars(["i"]) - domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) - _other = p.Variable("other") - _self = p.Variable("self") - i = p.Variable("i") - lhs = _self.index(i) - rshape = (self.cdim, ) - if globalp: - rhs = _other.index(0) - rshape = (1, ) - elif other_is_self: - rhs = _self.index(i) - else: - rhs = _other.index(i) - insn = loopy.Assignment(lhs, op(lhs, rhs), within_inames=frozenset(["i"])) - data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,))] - if not other_is_self: - data.append(loopy.GlobalArg("other", dtype=dtype, shape=rshape)) - knl = loopy.make_function([domain], [insn], data, name=name, target=loopy.CTarget(), lang_version=(2018, 2)) - return self._iop_kernel_cache.setdefault(key, _make_object('Kernel', knl, 
name)) - - def _iop(self, other, op): - globalp = False - if np.isscalar(other): - other = _make_object('Global', 1, data=other) - globalp = True - elif other is not self: - self._check_shape(other) - args = [self(INC)] - if other is not self: - args.append(other(READ)) - par_loop(self._iop_kernel(op, globalp, other is self, other.dtype), self.dataset.set, *args) - return self - - def _inner_kernel(self, dtype): - try: - if not hasattr(self, "_inner_kernel_cache"): - self._inner_kernel_cache = {} - return self._inner_kernel_cache[dtype] - except KeyError: - pass - import islpy as isl - import pymbolic.primitives as p - inames = isl.make_zero_and_vars(["i"]) - domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) - _self = p.Variable("self") - _other = p.Variable("other") - _ret = p.Variable("ret") - _conj = p.Variable("conj") if dtype.kind == "c" else lambda x: x - i = p.Variable("i") - insn = loopy.Assignment(_ret[0], _ret[0] + _self[i]*_conj(_other[i]), - within_inames=frozenset(["i"])) - data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)), - loopy.GlobalArg("other", dtype=dtype, shape=(self.cdim,)), - loopy.GlobalArg("ret", dtype=self.dtype, shape=(1,))] - knl = loopy.make_function([domain], [insn], data, name="inner", target=loopy.CTarget(), lang_version=(2018, 2)) - k = _make_object('Kernel', knl, "inner") - return self._inner_kernel_cache.setdefault(dtype, k) - - def inner(self, other): - """Compute the l2 inner product of the flattened :class:`Dat` - - :arg other: the other :class:`Dat` to compute the inner - product against. The complex conjugate of this is taken. - - """ - self._check_shape(other) - ret = _make_object('Global', 1, data=0, dtype=self.dtype) - par_loop(self._inner_kernel(other.dtype), self.dataset.set, - self(READ), other(READ), ret(INC)) - return ret.data_ro[0] - - @property - def norm(self): - """Compute the l2 norm of this :class:`Dat` - - .. note:: - - This acts on the flattened data (see also :meth:`inner`).""" - from math import sqrt - return sqrt(self.inner(self).real) - - def __pos__(self): - pos = _make_object('Dat', self) - return pos - - def __add__(self, other): - """Pointwise addition of fields.""" - return self._op(other, operator.add) - - def __radd__(self, other): - """Pointwise addition of fields. - - self.__radd__(other) <==> other + self.""" - return self + other - - @cached_property - def _neg_kernel(self): - # Copy and negate in one go. - import islpy as isl - import pymbolic.primitives as p - name = "neg" - inames = isl.make_zero_and_vars(["i"]) - domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) - lvalue = p.Variable("other") - rvalue = p.Variable("self") - i = p.Variable("i") - insn = loopy.Assignment(lvalue.index(i), -rvalue.index(i), within_inames=frozenset(["i"])) - data = [loopy.GlobalArg("other", dtype=self.dtype, shape=(self.cdim,)), - loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,))] - knl = loopy.make_function([domain], [insn], data, name=name, target=loopy.CTarget(), lang_version=(2018, 2)) - return _make_object('Kernel', knl, name) - - def __neg__(self): - neg = _make_object('Dat', self.dataset, dtype=self.dtype) - par_loop(self._neg_kernel, self.dataset.set, neg(WRITE), self(READ)) - return neg - - def __sub__(self, other): - """Pointwise subtraction of fields.""" - return self._op(other, operator.sub) - - def __rsub__(self, other): - """Pointwise subtraction of fields. 
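Together with the operators defined above, this supports whole-Dat
expressions such as (sketch) ::

    r = 2.0 * a - b / 3.0    # allocates and returns a new Dat
    a += b                   # in-place update, executed as a par_loop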
- - self.__rsub__(other) <==> other - self.""" - ret = -self - ret += other - return ret - - def __mul__(self, other): - """Pointwise multiplication or scaling of fields.""" - return self._op(other, operator.mul) - - def __rmul__(self, other): - """Pointwise multiplication or scaling of fields. - - self.__rmul__(other) <==> other * self.""" - return self.__mul__(other) - - def __truediv__(self, other): - """Pointwise division or scaling of fields.""" - return self._op(other, operator.truediv) - - __div__ = __truediv__ # Python 2 compatibility - - def __iadd__(self, other): - """Pointwise addition of fields.""" - return self._iop(other, operator.iadd) - - def __isub__(self, other): - """Pointwise subtraction of fields.""" - return self._iop(other, operator.isub) - - def __imul__(self, other): - """Pointwise multiplication or scaling of fields.""" - return self._iop(other, operator.imul) - - def __itruediv__(self, other): - """Pointwise division or scaling of fields.""" - return self._iop(other, operator.itruediv) - - @collective - def global_to_local_begin(self, access_mode): - """Begin a halo exchange from global to ghosted representation. - - :kwarg access_mode: Mode with which the data will subsequently - be accessed.""" - halo = self.dataset.halo - if halo is None: - return - if not self.halo_valid and access_mode in {READ, RW}: - halo.global_to_local_begin(self, WRITE) - elif access_mode in {INC, MIN, MAX}: - min_, max_ = dtype_limits(self.dtype) - val = {MAX: min_, MIN: max_, INC: 0}[access_mode] - self._data[self.dataset.size:] = val - else: - # WRITE - pass - - @collective - def global_to_local_end(self, access_mode): - """End a halo exchange from global to ghosted representation. - - :kwarg access_mode: Mode with which the data will subsequently - be accessed.""" - halo = self.dataset.halo - if halo is None: - return - if not self.halo_valid and access_mode in {READ, RW}: - halo.global_to_local_end(self, WRITE) - self.halo_valid = True - elif access_mode in {INC, MIN, MAX}: - self.halo_valid = False - else: - # WRITE - pass - - @collective - def local_to_global_begin(self, insert_mode): - """Begin a halo exchange from ghosted to global representation. - - :kwarg insert_mode: insertion mode (an access descriptor)""" - halo = self.dataset.halo - if halo is None: - return - halo.local_to_global_begin(self, insert_mode) - - @collective - def local_to_global_end(self, insert_mode): - """End a halo exchange from ghosted to global representation. - - :kwarg insert_mode: insertion mode (an access descriptor)""" - halo = self.dataset.halo - if halo is None: - return - halo.local_to_global_end(self, insert_mode) - self.halo_valid = False - - -class DatView(Dat): - """An indexed view into a :class:`Dat`. - - This object can be used like a :class:`Dat` but the kernel will - only see the requested index, rather than the full data. - - :arg dat: The :class:`Dat` to create a view into. - :arg index: The component to select a view of. 
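# How the halo-exchange pair above is used around an indirect read,
# continuing the sketch started earlier; in serial (no halo) both calls
# are no-ops, and with READ/RW a dirty halo triggers a real exchange.
d1.global_to_local_begin(op2.READ)  # start the exchange if halos are dirty
# ... independent local work can overlap the communication here ...
d1.global_to_local_end(op2.READ)    # ghost entries of d1 are now valid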
- """ - def __init__(self, dat, index): - index = as_tuple(index) - assert len(index) == len(dat.dim) - for i, d in zip(index, dat.dim): - if not (0 <= i < d): - raise IndexValueError("Can't create DatView with index %s for Dat with shape %s" % (index, dat.dim)) - self.index = index - # Point at underlying data - super(DatView, self).__init__(dat.dataset, - dat._data, - dtype=dat.dtype, - name="view[%s](%s)" % (index, dat.name)) - self._parent = dat - - @cached_property - def _kernel_args_(self): - return self._parent._kernel_args_ - - @cached_property - def _argtypes_(self): - return self._parent._argtypes_ - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.index, self._parent._wrapper_cache_key_) - - @cached_property - def cdim(self): - return 1 - - @cached_property - def dim(self): - return (1, ) - - @cached_property - def shape(self): - return (self.dataset.total_size, ) - - @property - def data(self): - full = self._parent.data - idx = (slice(None), *self.index) - return full[idx] - - @property - def data_ro(self): - full = self._parent.data_ro - idx = (slice(None), *self.index) - return full[idx] - - @property - def data_with_halos(self): - full = self._parent.data_with_halos - idx = (slice(None), *self.index) - return full[idx] - - @property - def data_ro_with_halos(self): - full = self._parent.data_ro_with_halos - idx = (slice(None), *self.index) - return full[idx] - - -class MixedDat(Dat): - r"""A container for a bag of :class:`Dat`\s. - - Initialized either from a :class:`MixedDataSet`, a :class:`MixedSet`, or - an iterable of :class:`DataSet`\s and/or :class:`Set`\s, where all the - :class:`Set`\s are implcitly upcast to :class:`DataSet`\s :: - - mdat = op2.MixedDat(mdset) - mdat = op2.MixedDat([dset1, ..., dsetN]) - - or from an iterable of :class:`Dat`\s :: - - mdat = op2.MixedDat([dat1, ..., datN]) - """ - - def __init__(self, mdset_or_dats): - def what(x): - if isinstance(x, (Global, GlobalDataSet, GlobalSet)): - return "Global" - elif isinstance(x, (Dat, DataSet, Set)): - return "Dat" - else: - raise DataSetTypeError("Huh?!") - if isinstance(mdset_or_dats, MixedDat): - self._dats = tuple(_make_object(what(d), d) for d in mdset_or_dats) - else: - self._dats = tuple(d if isinstance(d, (Dat, Global)) else _make_object(what(d), d) for d in mdset_or_dats) - if not all(d.dtype == self._dats[0].dtype for d in self._dats): - raise DataValueError('MixedDat with different dtypes is not supported') - # TODO: Think about different communicators on dats (c.f. 
-class MixedDat(Dat):
-    r"""A container for a bag of :class:`Dat`\s.
-
-    Initialized either from a :class:`MixedDataSet`, a :class:`MixedSet`, or
-    an iterable of :class:`DataSet`\s and/or :class:`Set`\s, where all the
-    :class:`Set`\s are implicitly upcast to :class:`DataSet`\s ::
-
-        mdat = op2.MixedDat(mdset)
-        mdat = op2.MixedDat([dset1, ..., dsetN])
-
-    or from an iterable of :class:`Dat`\s ::
-
-        mdat = op2.MixedDat([dat1, ..., datN])
-    """
-
-    def __init__(self, mdset_or_dats):
-        def what(x):
-            if isinstance(x, (Global, GlobalDataSet, GlobalSet)):
-                return "Global"
-            elif isinstance(x, (Dat, DataSet, Set)):
-                return "Dat"
-            else:
-                raise DataSetTypeError("Huh?!")
-        if isinstance(mdset_or_dats, MixedDat):
-            self._dats = tuple(_make_object(what(d), d) for d in mdset_or_dats)
-        else:
-            self._dats = tuple(d if isinstance(d, (Dat, Global)) else _make_object(what(d), d) for d in mdset_or_dats)
-        if not all(d.dtype == self._dats[0].dtype for d in self._dats):
-            raise DataValueError('MixedDat with different dtypes is not supported')
-        # TODO: Think about different communicators on dats (c.f. MixedSet)
-        self.comm = self._dats[0].comm
-
-    @cached_property
-    def _kernel_args_(self):
-        return tuple(itertools.chain(*(d._kernel_args_ for d in self)))
-
-    @cached_property
-    def _argtypes_(self):
-        return tuple(itertools.chain(*(d._argtypes_ for d in self)))
-
-    @cached_property
-    def _wrapper_cache_key_(self):
-        return (type(self),) + tuple(d._wrapper_cache_key_ for d in self)
-
-    def __getitem__(self, idx):
-        """Return :class:`Dat` with index ``idx`` or a given slice of Dats."""
-        return self._dats[idx]
-
-    @cached_property
-    def dtype(self):
-        """The NumPy dtype of the data."""
-        return self._dats[0].dtype
-
-    @cached_property
-    def split(self):
-        r"""The underlying tuple of :class:`Dat`\s."""
-        return self._dats
-
-    @cached_property
-    def dataset(self):
-        r""":class:`MixedDataSet`\s this :class:`MixedDat` is defined on."""
-        return _make_object('MixedDataSet', tuple(s.dataset for s in self._dats))
-
-    @cached_property
-    def _data(self):
-        """Return the user-provided data buffer, or a zeroed buffer of
-        the correct size if none was provided."""
-        return tuple(d._data for d in self)
-
-    @property
-    @collective
-    def data(self):
-        """Numpy arrays containing the data excluding halos."""
-        return tuple(s.data for s in self._dats)
-
-    @property
-    @collective
-    def data_with_halos(self):
-        """Numpy arrays containing the data including halos."""
-        return tuple(s.data_with_halos for s in self._dats)
-
-    @property
-    @collective
-    def data_ro(self):
-        """Numpy arrays with read-only data excluding halos."""
-        return tuple(s.data_ro for s in self._dats)
-
-    @property
-    @collective
-    def data_ro_with_halos(self):
-        """Numpy arrays with read-only data including halos."""
-        return tuple(s.data_ro_with_halos for s in self._dats)
-
-    @property
-    def halo_valid(self):
-        """Does this Dat have up to date halos?"""
-        return all(s.halo_valid for s in self)
-
-    @halo_valid.setter
-    def halo_valid(self, val):
-        """Indicate whether this Dat requires a halo update."""
-        for d in self:
-            d.halo_valid = val
-
-    @collective
-    def global_to_local_begin(self, access_mode):
-        for s in self:
-            s.global_to_local_begin(access_mode)
-
-    @collective
-    def global_to_local_end(self, access_mode):
-        for s in self:
-            s.global_to_local_end(access_mode)
-
-    @collective
-    def local_to_global_begin(self, insert_mode):
-        for s in self:
-            s.local_to_global_begin(insert_mode)
-
-    @collective
-    def local_to_global_end(self, insert_mode):
-        for s in self:
-            s.local_to_global_end(insert_mode)
-
-    @collective
-    def zero(self, subset=None):
-        """Zero the data associated with this :class:`MixedDat`.
-
-        :arg subset: optional subset of entries to zero (not implemented)."""
-        if subset is not None:
-            raise NotImplementedError("Subsets of mixed sets not implemented")
-        for d in self._dats:
-            d.zero()
-
-    @cached_property
-    def nbytes(self):
-        """Return an estimate of the size of the data associated with this
-        :class:`MixedDat` in bytes. This will be the correct size of the data
-        payload, but does not take into account the (presumably small)
-        overhead of the object and its metadata.
-
-        Note that this is the process local memory usage, not the sum
-        over all MPI processes.
-        """
-
-        return np.sum([d.nbytes for d in self._dats])
-
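# A sketch of MixedDat's bag-of-Dats behaviour, reusing `velocity` from
# above; the second field is made up.
pressure = op2.Dat(nodes ** 1, data=[1.0] * 4, dtype=float)
md = op2.MixedDat([velocity, pressure])
assert len(md) == 2  # one entry per component Dat
md.zero()            # delegates to each component in turn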
-    @collective
-    def copy(self, other, subset=None):
-        """Copy the data in this :class:`MixedDat` into another.
-
-        :arg other: The destination :class:`MixedDat`
-        :arg subset: Subsets are not supported, this must be :class:`None`"""
-
-        if subset is not None:
-            raise NotImplementedError("MixedDat.copy with a Subset is not supported")
-        for s, o in zip(self, other):
-            s.copy(o)
-
-    def __iter__(self):
-        r"""Yield all :class:`Dat`\s when iterated over."""
-        for d in self._dats:
-            yield d
-
-    def __len__(self):
-        r"""Return the number of contained :class:`Dat`\s."""
-        return len(self._dats)
-
-    def __hash__(self):
-        return hash(self._dats)
-
-    def __eq__(self, other):
-        r""":class:`MixedDat`\s are equal if all their contained :class:`Dat`\s
-        are."""
-        return type(self) == type(other) and self._dats == other._dats
-
-    def __ne__(self, other):
-        r""":class:`MixedDat`\s are unequal if any of their contained
-        :class:`Dat`\s differ."""
-        return not self.__eq__(other)
-
-    def __str__(self):
-        return "OP2 MixedDat composed of Dats: %s" % (self._dats,)
-
-    def __repr__(self):
-        return "MixedDat(%r)" % (self._dats,)
-
-    def inner(self, other):
-        """Compute the l2 inner product.
-
-        :arg other: the other :class:`MixedDat` to compute the inner product against"""
-        ret = 0
-        for s, o in zip(self, other):
-            ret += s.inner(o)
-        return ret
-
-    def _op(self, other, op):
-        ret = []
-        if np.isscalar(other):
-            for s in self:
-                ret.append(op(s, other))
-        else:
-            self._check_shape(other)
-            for s, o in zip(self, other):
-                ret.append(op(s, o))
-        return _make_object('MixedDat', ret)
-
-    def _iop(self, other, op):
-        if np.isscalar(other):
-            for s in self:
-                op(s, other)
-        else:
-            self._check_shape(other)
-            for s, o in zip(self, other):
-                op(s, o)
-        return self
-
-    def __pos__(self):
-        ret = []
-        for s in self:
-            ret.append(s.__pos__())
-        return _make_object('MixedDat', ret)
-
-    def __neg__(self):
-        ret = []
-        for s in self:
-            ret.append(s.__neg__())
-        return _make_object('MixedDat', ret)
-
-    def __add__(self, other):
-        """Pointwise addition of fields."""
-        return self._op(other, operator.add)
-
-    def __radd__(self, other):
-        """Pointwise addition of fields.
-
-        self.__radd__(other) <==> other + self."""
-        return self._op(other, operator.add)
-
-    def __sub__(self, other):
-        """Pointwise subtraction of fields."""
-        return self._op(other, operator.sub)
-
-    def __rsub__(self, other):
-        """Pointwise subtraction of fields.
-
-        self.__rsub__(other) <==> other - self."""
-        return self._op(other, operator.sub)
-
-    def __mul__(self, other):
-        """Pointwise multiplication or scaling of fields."""
-        return self._op(other, operator.mul)
-
-    def __rmul__(self, other):
-        """Pointwise multiplication or scaling of fields.
-
-        self.__rmul__(other) <==> other * self."""
-        return self._op(other, operator.mul)
-
-    def __div__(self, other):
-        """Pointwise division or scaling of fields."""
-        # operator.div does not exist in Python 3; use truediv, matching
-        # Dat.__truediv__.
-        return self._op(other, operator.truediv)
-
-    def __iadd__(self, other):
-        """Pointwise addition of fields."""
-        return self._iop(other, operator.iadd)
-
-    def __isub__(self, other):
-        """Pointwise subtraction of fields."""
-        return self._iop(other, operator.isub)
-
-    def __imul__(self, other):
-        """Pointwise multiplication or scaling of fields."""
-        return self._iop(other, operator.imul)
-
-    def __idiv__(self, other):
-        """Pointwise division or scaling of fields."""
-        # operator.idiv does not exist in Python 3; use itruediv, matching
-        # Dat.__itruediv__.
-        return self._iop(other, operator.itruediv)
-
-
-class Global(DataCarrier, _EmptyDataMixin):
-
-    """OP2 global value.
-
-    When a ``Global`` is passed to a :func:`pyop2.op2.par_loop`, the access
-    descriptor is passed by `calling` the ``Global``.
For example, if - a ``Global`` named ``G`` is to be accessed for reading, this is - accomplished by:: - - G(pyop2.READ) - - It is permissible to pass `None` as the `data` argument. In this - case, allocation of the data buffer is postponed until it is - accessed. - - .. note:: - If the data buffer is not passed in, it is implicitly - initialised to be zero. - """ - - _modes = [READ, INC, MIN, MAX] - - @validate_type(('name', str, NameTypeError)) - def __init__(self, dim, data=None, dtype=None, name=None, comm=None): - if isinstance(dim, Global): - # If g is a Global, Global(g) performs a deep copy. This is for compatibility with Dat. - self.__init__(dim._dim, None, dtype=dim.dtype, - name="copy_of_%s" % dim.name, comm=dim.comm) - dim.copy(self) - return - self._dim = as_tuple(dim, int) - self._cdim = np.prod(self._dim).item() - _EmptyDataMixin.__init__(self, data, dtype, self._dim) - self._buf = np.empty(self.shape, dtype=self.dtype) - self._name = name or "global_#x%x" % id(self) - self.comm = comm - - @cached_property - def _kernel_args_(self): - return (self._data.ctypes.data, ) - - @cached_property - def _argtypes_(self): - return (ctypes.c_voidp, ) - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.dtype, self.shape) - - @validate_in(('access', _modes, ModeValueError)) - def __call__(self, access, path=None): - return _make_object('Arg', data=self, access=access) - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - def __getitem__(self, idx): - """Return self if ``idx`` is 0, raise an error otherwise.""" - if idx != 0: - raise IndexValueError("Can only extract component 0 from %r" % self) - return self - - def __str__(self): - return "OP2 Global Argument: %s with dim %s and value %s" \ - % (self._name, self._dim, self._data) - - def __repr__(self): - return "Global(%r, %r, %r, %r)" % (self._dim, self._data, - self._data.dtype, self._name) - - @cached_property - def dataset(self): - return _make_object('GlobalDataSet', self) - - @property - def shape(self): - return self._dim - - @property - def data(self): - """Data array.""" - if len(self._data) == 0: - raise RuntimeError("Illegal access: No data associated with this Global!") - return self._data - - @property - def dtype(self): - return self._dtype - - @property - def data_ro(self): - """Data array.""" - view = self.data.view() - view.setflags(write=False) - return view - - @data.setter - def data(self, value): - self._data[:] = verify_reshape(value, self.dtype, self.dim) - - @property - def nbytes(self): - """Return an estimate of the size of the data associated with this - :class:`Global` in bytes. This will be the correct size of the - data payload, but does not take into account the overhead of - the object and its metadata. This renders this method of - little statistical significance, however it is included to - make the interface consistent. - """ - - return self.dtype.itemsize * self._cdim - - @collective - def duplicate(self): - """Return a deep copy of self.""" - return type(self)(self.dim, data=np.copy(self.data_ro), - dtype=self.dtype, name=self.name) - - @collective - def copy(self, other, subset=None): - """Copy the data in this :class:`Global` into another. - - :arg other: The destination :class:`Global` - :arg subset: A :class:`Subset` of elements to copy (optional)""" - - other.data = np.copy(self.data_ro) - - @collective - def zero(self): - self._data[...] 
= 0 - - @collective - def global_to_local_begin(self, access_mode): - """Dummy halo operation for the case in which a :class:`Global` forms - part of a :class:`MixedDat`.""" - pass - - @collective - def global_to_local_end(self, access_mode): - """Dummy halo operation for the case in which a :class:`Global` forms - part of a :class:`MixedDat`.""" - pass - - @collective - def local_to_global_begin(self, insert_mode): - """Dummy halo operation for the case in which a :class:`Global` forms - part of a :class:`MixedDat`.""" - pass - - @collective - def local_to_global_end(self, insert_mode): - """Dummy halo operation for the case in which a :class:`Global` forms - part of a :class:`MixedDat`.""" - pass - - def _op(self, other, op): - ret = type(self)(self.dim, dtype=self.dtype, name=self.name, comm=self.comm) - if isinstance(other, Global): - ret.data[:] = op(self.data_ro, other.data_ro) - else: - ret.data[:] = op(self.data_ro, other) - return ret - - def _iop(self, other, op): - if isinstance(other, Global): - op(self.data[:], other.data_ro) - else: - op(self.data[:], other) - return self - - def __pos__(self): - return self.duplicate() - - def __add__(self, other): - """Pointwise addition of fields.""" - return self._op(other, operator.add) - - def __radd__(self, other): - """Pointwise addition of fields. - - self.__radd__(other) <==> other + self.""" - return self + other - - def __neg__(self): - return type(self)(self.dim, data=-np.copy(self.data_ro), - dtype=self.dtype, name=self.name) - - def __sub__(self, other): - """Pointwise subtraction of fields.""" - return self._op(other, operator.sub) - - def __rsub__(self, other): - """Pointwise subtraction of fields. - - self.__rsub__(other) <==> other - self.""" - ret = -self - ret += other - return ret - - def __mul__(self, other): - """Pointwise multiplication or scaling of fields.""" - return self._op(other, operator.mul) - - def __rmul__(self, other): - """Pointwise multiplication or scaling of fields. - - self.__rmul__(other) <==> other * self.""" - return self.__mul__(other) - - def __truediv__(self, other): - """Pointwise division or scaling of fields.""" - return self._op(other, operator.truediv) - - def __iadd__(self, other): - """Pointwise addition of fields.""" - return self._iop(other, operator.iadd) - - def __isub__(self, other): - """Pointwise subtraction of fields.""" - return self._iop(other, operator.isub) - - def __imul__(self, other): - """Pointwise multiplication or scaling of fields.""" - return self._iop(other, operator.imul) - - def __itruediv__(self, other): - """Pointwise division or scaling of fields.""" - return self._iop(other, operator.itruediv) - - def inner(self, other): - assert isinstance(other, Global) - return np.dot(self.data_ro, np.conj(other.data_ro)) - - -class Map(object): - - """OP2 map, a relation between two :class:`Set` objects. - - Each entry in the ``iterset`` maps to ``arity`` entries in the - ``toset``. When a map is used in a :func:`pyop2.op2.par_loop`, it is - possible to use Python index notation to select an individual entry on the - right hand side of this map. There are three possibilities: - - * No index. All ``arity`` :class:`Dat` entries will be passed to the - kernel. - * An integer: ``some_map[n]``. The ``n`` th entry of the - map result will be passed to the kernel. 
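# A sketch of a Global used as an INC reduction target, as described above;
# the kernel body is illustrative and reuses `nodes` and `d1` from the
# earlier sketches.
g = op2.Global(1, data=0.0, dtype=float)
ksum = op2.Kernel("void ksum(double *g, double const *x) { g[0] += x[0]; }",
                  "ksum")
op2.par_loop(ksum, nodes, g(op2.INC), d1(op2.READ))
print(g.data_ro[0])  # sum of d1's entries, reduced over MPI ranks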
- """ - - dtype = IntType - - @validate_type(('iterset', Set, SetTypeError), ('toset', Set, SetTypeError), - ('arity', numbers.Integral, ArityTypeError), ('name', str, NameTypeError)) - def __init__(self, iterset, toset, arity, values=None, name=None, offset=None): - self._iterset = iterset - self._toset = toset - self.comm = toset.comm - self._arity = arity - self._values = verify_reshape(values, IntType, - (iterset.total_size, arity), - allow_none=True) - self.shape = (iterset.total_size, arity) - self._name = name or "map_#x%x" % id(self) - if offset is None or len(offset) == 0: - self._offset = None - else: - self._offset = verify_reshape(offset, IntType, (arity, )) - # A cache for objects built on top of this map - self._cache = {} - - @cached_property - def _kernel_args_(self): - return (self._values.ctypes.data, ) - - @cached_property - def _argtypes_(self): - return (ctypes.c_voidp, ) - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.arity, tuplify(self.offset)) - - # This is necessary so that we can convert a Map to a tuple - # (needed in as_tuple). Because, __getitem__ no longer returns a - # Map we have to explicitly provide an iterable interface - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __len__(self): - """This is not a mixed type and therefore of length 1.""" - return 1 - - @cached_property - def split(self): - return (self,) - - @cached_property - def iterset(self): - """:class:`Set` mapped from.""" - return self._iterset - - @cached_property - def toset(self): - """:class:`Set` mapped to.""" - return self._toset - - @cached_property - def arity(self): - """Arity of the mapping: number of toset elements mapped to per - iterset element.""" - return self._arity - - @cached_property - def arities(self): - """Arity of the mapping: number of toset elements mapped to per - iterset element. - - :rtype: tuple""" - return (self._arity,) - - @cached_property - def arange(self): - """Tuple of arity offsets for each constituent :class:`Map`.""" - return (0, self._arity) - - @cached_property - def values(self): - """Mapping array. - - This only returns the map values for local points, to see the - halo points too, use :meth:`values_with_halo`.""" - return self._values[:self.iterset.size] - - @cached_property - def values_with_halo(self): - """Mapping array. - - This returns all map values (including halo points), see - :meth:`values` if you only need to look at the local - points.""" - return self._values - - @cached_property - def name(self): - """User-defined label""" - return self._name - - @cached_property - def offset(self): - """The vertical offset.""" - return self._offset - - def __str__(self): - return "OP2 Map: %s from (%s) to (%s) with arity %s" \ - % (self._name, self._iterset, self._toset, self._arity) - - def __repr__(self): - return "Map(%r, %r, %r, None, %r)" \ - % (self._iterset, self._toset, self._arity, self._name) - - def __le__(self, o): - """self<=o if o equals self or self._parent <= o.""" - return self == o - - -class PermutedMap(Map): - """Composition of a standard :class:`Map` with a constant permutation. - - :arg map_: The map to permute. - :arg permutation: The permutation of the map indices. - - Where normally staging to element data is performed as - - .. code-block:: - - local[i] = global[map[i]] - - With a :class:`PermutedMap` we instead get - - .. 
code-block::

-        local[i] = global[map[permutation[i]]]
-
-    This might be useful if your local kernel wants data in a
-    different order to the one that the map provides, and you don't
-    want two global-sized data structures.
-    """
-    def __init__(self, map_, permutation):
-        self.map_ = map_
-        self.permutation = np.asarray(permutation, dtype=Map.dtype)
-        assert (np.unique(permutation) == np.arange(map_.arity, dtype=Map.dtype)).all()
-
-    @cached_property
-    def _wrapper_cache_key_(self):
-        return super()._wrapper_cache_key_ + (tuple(self.permutation),)
-
-    def __getattr__(self, name):
-        return getattr(self.map_, name)
-
-
-class MixedMap(Map, ObjectCached):
-    r"""A container for a bag of :class:`Map`\s."""
-
-    def __init__(self, maps):
-        r""":param iterable maps: Iterable of :class:`Map`\s"""
-        if self._initialized:
-            return
-        self._maps = maps
-        if not all(m is None or m.iterset == self.iterset for m in self._maps):
-            raise MapTypeError("All maps in a MixedMap need to share the same iterset")
-        # TODO: Think about different communicators on maps (c.f. MixedSet)
-        # TODO: What if all maps are None?
-        comms = tuple(m.comm for m in self._maps if m is not None)
-        if not all(c == comms[0] for c in comms):
-            raise MapTypeError("All maps need to share a communicator")
-        if len(comms) == 0:
-            raise MapTypeError("Don't know how to make communicator")
-        self.comm = comms[0]
-        self._initialized = True
-
-    @classmethod
-    def _process_args(cls, *args, **kwargs):
-        maps = as_tuple(args[0], type=Map, allow_none=True)
-        cache = maps[0]
-        return (cache, ) + (maps, ), kwargs
-
-    @classmethod
-    def _cache_key(cls, maps):
-        return maps
-
-    @cached_property
-    def _kernel_args_(self):
-        return tuple(itertools.chain(*(m._kernel_args_ for m in self if m is not None)))
-
-    @cached_property
-    def _argtypes_(self):
-        return tuple(itertools.chain(*(m._argtypes_ for m in self if m is not None)))
-
-    @cached_property
-    def _wrapper_cache_key_(self):
-        return tuple(m._wrapper_cache_key_ for m in self if m is not None)
-
-    @cached_property
-    def split(self):
-        r"""The underlying tuple of :class:`Map`\s."""
-        return self._maps
-
-    @cached_property
-    def iterset(self):
-        """:class:`MixedSet` mapped from."""
-        return reduce(lambda a, b: a or b, map(lambda s: s if s is None else s.iterset, self._maps))
-
-    @cached_property
-    def toset(self):
-        """:class:`MixedSet` mapped to."""
-        return MixedSet(tuple(GlobalSet(comm=self.comm) if m is None else
-                              m.toset for m in self._maps))
-
-    @cached_property
-    def arity(self):
-        """Arity of the mapping: total number of toset elements mapped to per
-        iterset element."""
-        return sum(m.arity for m in self._maps)
-
-    @cached_property
-    def arities(self):
-        """Arity of the mapping: number of toset elements mapped to per
-        iterset element.
-
-        :rtype: tuple"""
-        return tuple(m.arity for m in self._maps)
-
-    @cached_property
-    def arange(self):
-        """Tuple of arity offsets for each constituent :class:`Map`."""
-        return (0,) + tuple(np.cumsum(self.arities))
-
-    @cached_property
-    def values(self):
-        """Mapping arrays excluding data for halos.
-
-        This only returns the map values for local points, to see the
-        halo points too, use :meth:`values_with_halo`."""
-        return tuple(m.values for m in self._maps)
-
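# A sketch of the PermutedMap staging order just described; the sets,
# arity and map values are made up.
cells = op2.Set(2)
verts = op2.Set(4)
m = op2.Map(cells, verts, 3, values=[[0, 1, 2], [1, 2, 3]])
pm = op2.PermutedMap(m, [2, 0, 1])  # kernel sees entries in order 2, 0, 1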
-    @cached_property
-    def values_with_halo(self):
-        """Mapping arrays including data for halos.
-
-        This returns all map values (including halo points), see
-        :meth:`values` if you only need to look at the local
-        points."""
-        return tuple(None if m is None else
-                     m.values_with_halo for m in self._maps)
-
-    @cached_property
-    def name(self):
-        """User-defined labels"""
-        return tuple(m.name for m in self._maps)
-
-    @cached_property
-    def offset(self):
-        """Vertical offsets."""
-        return tuple(0 if m is None else m.offset for m in self._maps)
-
-    def __iter__(self):
-        r"""Yield all :class:`Map`\s when iterated over."""
-        for m in self._maps:
-            yield m
-
-    def __len__(self):
-        r"""Number of contained :class:`Map`\s."""
-        return len(self._maps)
-
-    def __le__(self, o):
-        """self<=o if o equals self, or if all constituent maps satisfy
-        ``m <= om`` pairwise."""
-        return self == o or all(m <= om for m, om in zip(self, o))
-
-    def __str__(self):
-        return "OP2 MixedMap composed of Maps: %s" % (self._maps,)
-
-    def __repr__(self):
-        return "MixedMap(%r)" % (self._maps,)
-
-
-class Sparsity(ObjectCached):
-
-    """OP2 Sparsity, the non-zero structure of a matrix derived from the union
-    of the outer product of pairs of :class:`Map` objects.
-
-    Examples of constructing a Sparsity: ::
-
-        Sparsity(single_dset, single_map, 'mass')
-        Sparsity((row_dset, col_dset), (single_rowmap, single_colmap))
-        Sparsity((row_dset, col_dset),
-                 [(first_rowmap, first_colmap), (second_rowmap, second_colmap)])
-
-    .. _MatMPIAIJSetPreallocation: http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Mat/MatMPIAIJSetPreallocation.html
-    """
-
-    def __init__(self, dsets, maps, *, iteration_regions=None, name=None, nest=None, block_sparse=None):
-        r"""
-        :param dsets: :class:`DataSet`\s for the left and right function
-            spaces this :class:`Sparsity` maps between
-        :param maps: :class:`Map`\s to build the :class:`Sparsity` from
-        :type maps: a pair of :class:`Map`\s specifying a row map and a column
-            map, or an iterable of pairs of :class:`Map`\s specifying multiple
-            row and column maps - if a single :class:`Map` is passed, it is
-            used as both a row map and a column map
-        :param iteration_regions: regions that select subsets of extruded maps to iterate over.
-        :param string name: user-defined label (optional)
-        :param nest: Should the sparsity over mixed set be built as nested blocks?
-        :param block_sparse: Should the sparsity for datasets with
-            cdim > 1 be built as a block sparsity?
-        """
-        # Protect against re-initialization when retrieved from cache
-        if self._initialized:
-            return
-
-        self._block_sparse = block_sparse
-        # Split into a list of row maps and a list of column maps
-        maps, iteration_regions = zip(*maps)
-        self._rmaps, self._cmaps = zip(*maps)
-        self._dsets = dsets
-
-        if isinstance(dsets[0], GlobalDataSet) or isinstance(dsets[1], GlobalDataSet):
-            self._dims = (((1, 1),),)
-            self._d_nnz = None
-            self._o_nnz = None
-            self._nrows = None if isinstance(dsets[0], GlobalDataSet) else self._rmaps[0].toset.size
-            self._ncols = None if isinstance(dsets[1], GlobalDataSet) else self._cmaps[0].toset.size
-            self.lcomm = dsets[0].comm if isinstance(dsets[0], GlobalDataSet) else self._rmaps[0].comm
-            self.rcomm = dsets[1].comm if isinstance(dsets[1], GlobalDataSet) else self._cmaps[0].comm
-        else:
-            self.lcomm = self._rmaps[0].comm
-            self.rcomm = self._cmaps[0].comm
-
-            rset, cset = self.dsets
-            # All rmaps and cmaps have the same data set - just use the first.
- self._nrows = rset.size - self._ncols = cset.size - - self._has_diagonal = (rset == cset) - - tmp = itertools.product([x.cdim for x in self._dsets[0]], - [x.cdim for x in self._dsets[1]]) - - dims = [[None for _ in range(self.shape[1])] for _ in range(self.shape[0])] - for r in range(self.shape[0]): - for c in range(self.shape[1]): - dims[r][c] = next(tmp) - - self._dims = tuple(tuple(d) for d in dims) - - if self.lcomm != self.rcomm: - raise ValueError("Haven't thought hard enough about different left and right communicators") - self.comm = self.lcomm - - self._name = name or "sparsity_#x%x" % id(self) - - self.iteration_regions = iteration_regions - # If the Sparsity is defined on MixedDataSets, we need to build each - # block separately - if (isinstance(dsets[0], MixedDataSet) or isinstance(dsets[1], MixedDataSet)) \ - and nest: - self._nested = True - self._blocks = [] - for i, rds in enumerate(dsets[0]): - row = [] - for j, cds in enumerate(dsets[1]): - row.append(Sparsity((rds, cds), [(rm.split[i], cm.split[j]) for - rm, cm in maps], - iteration_regions=iteration_regions, - block_sparse=block_sparse)) - self._blocks.append(row) - self._d_nnz = tuple(s._d_nnz for s in self) - self._o_nnz = tuple(s._o_nnz for s in self) - elif isinstance(dsets[0], GlobalDataSet) or isinstance(dsets[1], GlobalDataSet): - # Where the sparsity maps either from or to a Global, we - # don't really have any sparsity structure. - self._blocks = [[self]] - self._nested = False - else: - for dset in dsets: - if isinstance(dset, MixedDataSet) and any([isinstance(d, GlobalDataSet) for d in dset]): - raise SparsityFormatError("Mixed monolithic matrices with Global rows or columns are not supported.") - self._nested = False - with timed_region("CreateSparsity"): - nnz, onnz = build_sparsity(self) - self._d_nnz = nnz - self._o_nnz = onnz - self._blocks = [[self]] - self._initialized = True - - _cache = {} - - @classmethod - @validate_type(('dsets', (Set, DataSet, tuple, list), DataSetTypeError), - ('maps', (Map, tuple, list), MapTypeError)) - def _process_args(cls, dsets, maps, *, iteration_regions=None, name=None, nest=None, block_sparse=None): - "Turn maps argument into a canonical tuple of pairs." - - # A single data set becomes a pair of identical data sets - dsets = [dsets, dsets] if isinstance(dsets, (Set, DataSet)) else list(dsets) - # Upcast Sets to DataSets - dsets = [s ** 1 if isinstance(s, Set) else s for s in dsets] - - # Check data sets are valid - for dset in dsets: - if not isinstance(dset, DataSet) and dset is not None: - raise DataSetTypeError("All data sets must be of type DataSet, not type %r" % type(dset)) - - # A single map becomes a pair of identical maps - maps = (maps, maps) if isinstance(maps, Map) else maps - # A single pair becomes a tuple of one pair - maps = (maps,) if isinstance(maps[0], Map) else maps - - # Check maps are sane - for pair in maps: - if pair[0] is None or pair[1] is None: - # None of this checking makes sense if one of the - # matrix operands is a Global. 
- continue - for m in pair: - if not isinstance(m, Map): - raise MapTypeError( - "All maps must be of type map, not type %r" % type(m)) - if len(m.values_with_halo) == 0 and m.iterset.total_size > 0: - raise MapValueError( - "Unpopulated map values when trying to build sparsity.") - # Make sure that the "to" Set of each map in a pair is the set of - # the corresponding DataSet set - if not (pair[0].toset == dsets[0].set - and pair[1].toset == dsets[1].set): - raise RuntimeError("Map to set must be the same as corresponding DataSet set") - - # Each pair of maps must have the same from-set (iteration set) - if not pair[0].iterset == pair[1].iterset: - raise RuntimeError("Iterset of both maps in a pair must be the same") - - rmaps, cmaps = zip(*maps) - if iteration_regions is None: - iteration_regions = tuple((ALL, ) for _ in maps) - else: - iteration_regions = tuple(tuple(sorted(region)) for region in iteration_regions) - if not len(rmaps) == len(cmaps): - raise RuntimeError("Must pass equal number of row and column maps") - - if rmaps[0] is not None and cmaps[0] is not None: - # Each row map must have the same to-set (data set) - if not all(m.toset == rmaps[0].toset for m in rmaps): - raise RuntimeError("To set of all row maps must be the same") - - # Each column map must have the same to-set (data set) - if not all(m.toset == cmaps[0].toset for m in cmaps): - raise RuntimeError("To set of all column maps must be the same") - - # Need to return the caching object, a tuple of the processed - # arguments and a dict of kwargs (empty in this case) - if isinstance(dsets[0], GlobalDataSet): - cache = None - elif isinstance(dsets[0].set, MixedSet): - cache = dsets[0].set[0] - else: - cache = dsets[0].set - if nest is None: - nest = configuration["matnest"] - if block_sparse is None: - block_sparse = configuration["block_sparsity"] - - maps = frozenset(zip(maps, iteration_regions)) - kwargs = {"name": name, - "nest": nest, - "block_sparse": block_sparse} - return (cache,) + (tuple(dsets), maps), kwargs - - @classmethod - def _cache_key(cls, dsets, maps, name, nest, block_sparse, *args, **kwargs): - return (dsets, maps, nest, block_sparse) - - def __getitem__(self, idx): - """Return :class:`Sparsity` block with row and column given by ``idx`` - or a given row of blocks.""" - try: - i, j = idx - return self._blocks[i][j] - except TypeError: - return self._blocks[idx] - - @cached_property - def dsets(self): - r"""A pair of :class:`DataSet`\s for the left and right function - spaces this :class:`Sparsity` maps between.""" - return self._dsets - - @cached_property - def maps(self): - """A list of pairs (rmap, cmap) where each pair of - :class:`Map` objects will later be used to assemble into this - matrix. The iterset of each of the maps in a pair must be the - same, while the toset of all the maps which appear first - must be common, this will form the row :class:`Set` of the - sparsity. 
Similarly, the toset of all the maps which appear
-        second must be common and will form the column :class:`Set` of
-        the ``Sparsity``."""
-        return list(zip(self._rmaps, self._cmaps))
-
-    @cached_property
-    def cmaps(self):
-        """The list of column maps this sparsity is assembled from."""
-        return self._cmaps
-
-    @cached_property
-    def rmaps(self):
-        """The list of row maps this sparsity is assembled from."""
-        return self._rmaps
-
-    @cached_property
-    def dims(self):
-        """A tuple of tuples where the ``i,j``th entry
-        is a pair giving the number of rows per entry of the row
-        :class:`Set` and the number of columns per entry of the column
-        :class:`Set` of the ``Sparsity``.  The extents of the first
-        two indices are given by the :attr:`shape` of the sparsity.
-        """
-        return self._dims
-
-    @cached_property
-    def shape(self):
-        """Number of block rows and columns."""
-        return (len(self._dsets[0] or [1]),
-                len(self._dsets[1] or [1]))
-
-    @cached_property
-    def nrows(self):
-        """The number of rows in the ``Sparsity``."""
-        return self._nrows
-
-    @cached_property
-    def ncols(self):
-        """The number of columns in the ``Sparsity``."""
-        return self._ncols
-
-    @cached_property
-    def nested(self):
-        r"""Whether the sparsity is built as nested blocks rather than
-        monolithically (a monolithic sparsity may still have a block structure).
-
-        To elaborate, if a sparsity maps between
-        :class:`MixedDataSet`\s, it can either be nested, in which
-        case it consists of as many blocks as the product of the
-        lengths of the datasets it maps between, or monolithic.  In the
-        latter case the sparsity is for the full map between the mixed
-        datasets, rather than between the blocks of the non-mixed
-        datasets underneath them.
-        """
-        return self._nested
-
-    @cached_property
-    def name(self):
-        """A user-defined label."""
-        return self._name
-
-    def __iter__(self):
-        r"""Iterate over all :class:`Sparsity`\s by row and then by column."""
-        for row in self._blocks:
-            for s in row:
-                yield s
-
-    def __str__(self):
-        return "OP2 Sparsity: dsets %s, rmaps %s, cmaps %s, name %s" % \
-               (self._dsets, self._rmaps, self._cmaps, self._name)
-
-    def __repr__(self):
-        return "Sparsity(%r, %r, %r)" % (self.dsets, self.maps, self.name)
-
-    @cached_property
-    def nnz(self):
-        """Array containing the number of non-zeroes in the various rows of the
-        diagonal portion of the local submatrix.
-
-        This is the same as the parameter `d_nnz` used for preallocation in
-        PETSc's MatMPIAIJSetPreallocation_."""
-        return self._d_nnz
-
-    @cached_property
-    def onnz(self):
-        """Array containing the number of non-zeroes in the various rows of the
-        off-diagonal portion of the local submatrix.
-
-        This is the same as the parameter `o_nnz` used for preallocation in
-        PETSc's MatMPIAIJSetPreallocation_."""
-        return self._o_nnz
-
-    @cached_property
-    def nz(self):
-        return self._d_nnz.sum()
-
-    @cached_property
-    def onz(self):
-        return self._o_nnz.sum()
-
-    def __contains__(self, other):
-        """Return true if other is a pair of maps in self.maps(). This
-        will also return true if the elements of other have parents in
-        self.maps()."""
-
-        for maps in self.maps:
-            if tuple(other) <= maps:
-                return True
-
-        return False
-
-
-class Mat(DataCarrier):
-    r"""OP2 matrix data.  A ``Mat`` is defined on a sparsity pattern and holds a value
-    for each element in the :class:`Sparsity`.
-
-    When a ``Mat`` is passed to :func:`pyop2.op2.par_loop`, the maps via which
-    indirection occurs for the row and column space, and the access
-    descriptor are passed by `calling` the ``Mat``.
For instance, if a - ``Mat`` named ``A`` is to be accessed for reading via a row :class:`Map` - named ``R`` and a column :class:`Map` named ``C``, this is accomplished by:: - - A(pyop2.READ, (R[pyop2.i[0]], C[pyop2.i[1]])) - - Notice that it is `always` necessary to index the indirection maps - for a ``Mat``. See the :class:`Mat` documentation for more - details. - - .. note :: - - After executing :func:`par_loop`\s that write to a ``Mat`` and - before using it (for example to view its values), you must call - :meth:`assemble` to finalise the writes. - """ - @cached_property - def pack(self): - from pyop2.codegen.builder import MatPack - return MatPack - - ASSEMBLED = "ASSEMBLED" - INSERT_VALUES = "INSERT_VALUES" - ADD_VALUES = "ADD_VALUES" - - _modes = [WRITE, INC] - - @validate_type(('sparsity', Sparsity, SparsityTypeError), - ('name', str, NameTypeError)) - def __init__(self, sparsity, dtype=None, name=None): - self._sparsity = sparsity - self.lcomm = sparsity.lcomm - self.rcomm = sparsity.rcomm - self.comm = sparsity.comm - dtype = dtype or ScalarType - self._datatype = np.dtype(dtype) - self._name = name or "mat_#x%x" % id(self) - self.assembly_state = Mat.ASSEMBLED - - @validate_in(('access', _modes, ModeValueError)) - def __call__(self, access, path, lgmaps=None, unroll_map=False): - path_maps = as_tuple(path, Map, 2) - if configuration["type_check"] and tuple(path_maps) not in self.sparsity: - raise MapValueError("Path maps not in sparsity maps") - return _make_object('Arg', data=self, map=path_maps, access=access, lgmaps=lgmaps, unroll_map=unroll_map) - - @cached_property - def _wrapper_cache_key_(self): - return (type(self), self.dtype, self.dims) - - def assemble(self): - """Finalise this :class:`Mat` ready for use. - - Call this /after/ executing all the par_loops that write to - the matrix before you want to look at it. - """ - raise NotImplementedError("Subclass should implement this") - - def addto_values(self, rows, cols, values): - """Add a block of values to the :class:`Mat`.""" - raise NotImplementedError( - "Abstract Mat base class doesn't know how to set values.") - - def set_values(self, rows, cols, values): - """Set a block of values in the :class:`Mat`.""" - raise NotImplementedError( - "Abstract Mat base class doesn't know how to set values.") - - @cached_property - def nblocks(self): - return int(np.prod(self.sparsity.shape)) - - @cached_property - def _argtypes_(self): - """Ctypes argtype for this :class:`Mat`""" - return tuple(ctypes.c_voidp for _ in self) - - @cached_property - def dims(self): - """A pair of integers giving the number of matrix rows and columns for - each member of the row :class:`Set` and column :class:`Set` - respectively. This corresponds to the ``cdim`` member of a - :class:`DataSet`.""" - return self._sparsity._dims - - @cached_property - def nrows(self): - "The number of rows in the matrix (local to this process)" - return sum(d.size * d.cdim for d in self.sparsity.dsets[0]) - - @cached_property - def nblock_rows(self): - """The number "block" rows in the matrix (local to this process). - - This is equivalent to the number of rows in the matrix divided - by the dimension of the row :class:`DataSet`. - """ - assert len(self.sparsity.dsets[0]) == 1, "Block rows don't make sense for mixed Mats" - return self.sparsity.dsets[0].size - - @cached_property - def nblock_cols(self): - """The number of "block" columns in the matrix (local to this process). 
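# A sketch of the Sparsity/Mat assembly cycle documented above, reusing
# `cells`, `verts` and `m` from the map example; the kernel is a made-up
# diagonal contribution with a 3x3 local tensor matching the map arity.
sparsity = op2.Sparsity((verts ** 1, verts ** 1), (m, m))
mat = op2.Mat(sparsity, dtype=float)
kmat = op2.Kernel(
    "void kmat(double A[3][3]) { for (int i = 0; i < 3; i++) A[i][i] += 1.0; }",
    "kmat")
op2.par_loop(kmat, cells, mat(op2.INC, (m, m)))
mat.assemble()  # finalise pending insertions before reading values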
- - This is equivalent to the number of columns in the matrix - divided by the dimension of the column :class:`DataSet`. - """ - assert len(self.sparsity.dsets[1]) == 1, "Block cols don't make sense for mixed Mats" - return self.sparsity.dsets[1].size - - @cached_property - def ncols(self): - "The number of columns in the matrix (local to this process)" - return sum(d.size * d.cdim for d in self.sparsity.dsets[1]) - - @cached_property - def sparsity(self): - """:class:`Sparsity` on which the ``Mat`` is defined.""" - return self._sparsity - - @cached_property - def _is_scalar_field(self): - # Sparsity from Dat to MixedDat has a shape like (1, (1, 1)) - # (which you can't take the product of) - return all(np.prod(d) == 1 for d in self.dims) - - @cached_property - def _is_vector_field(self): - return not self._is_scalar_field - - def change_assembly_state(self, new_state): - """Switch the matrix assembly state.""" - if new_state == Mat.ASSEMBLED or self.assembly_state == Mat.ASSEMBLED: - self.assembly_state = new_state - elif new_state != self.assembly_state: - self._flush_assembly() - self.assembly_state = new_state - else: - pass - - def _flush_assembly(self): - """Flush the in flight assembly operations (used when - switching between inserting and adding values).""" - pass - - @property - def values(self): - """A numpy array of matrix values. - - .. warning :: - This is a dense array, so will need a lot of memory. It's - probably not a good idea to access this property if your - matrix has more than around 10000 degrees of freedom. - """ - raise NotImplementedError("Abstract base Mat does not implement values()") - - @cached_property - def dtype(self): - """The Python type of the data.""" - return self._datatype - - @cached_property - def nbytes(self): - """Return an estimate of the size of the data associated with this - :class:`Mat` in bytes. This will be the correct size of the - data payload, but does not take into account the (presumably - small) overhead of the object and its metadata. The memory - associated with the sparsity pattern is also not recorded. - - Note that this is the process local memory usage, not the sum - over all MPI processes. - """ - if self._sparsity._block_sparse: - mult = np.sum(np.prod(self._sparsity.dims)) - else: - mult = 1 - return (self._sparsity.nz + self._sparsity.onz) \ - * self.dtype.itemsize * mult - - def __iter__(self): - """Yield self when iterated over.""" - yield self - - def __mul__(self, other): - """Multiply this :class:`Mat` with the vector ``other``.""" - raise NotImplementedError("Abstract base Mat does not implement multiplication") - - def __str__(self): - return "OP2 Mat: %s, sparsity (%s), datatype %s" \ - % (self._name, self._sparsity, self._datatype.name) - - def __repr__(self): - return "Mat(%r, %r, %r)" \ - % (self._sparsity, self._datatype, self._name) - -# Kernel API - - -class Kernel(Cached): - - """OP2 kernel type. 
- - :param code: kernel function definition, including signature; either a - string or an AST :class:`.Node` - :param name: kernel function name; must match the name of the kernel - function given in `code` - :param opts: options dictionary for :doc:`PyOP2 IR optimisations ` - (optional, ignored if `code` is a string) - :param include_dirs: list of additional include directories to be searched - when compiling the kernel (optional, defaults to empty) - :param headers: list of system headers to include when compiling the kernel - in the form ``#include `` (optional, defaults to empty) - :param user_code: code snippet to be executed once at the very start of - the generated kernel wrapper code (optional, defaults to - empty) - :param ldargs: A list of arguments to pass to the linker when - compiling this Kernel. - :param requires_zeroed_output_arguments: Does this kernel require the - output arguments to be zeroed on entry when called? (default no) - :param cpp: Is the kernel actually C++ rather than C? If yes, - then compile with the C++ compiler (kernel is wrapped in - extern C for linkage reasons). - - Consider the case of initialising a :class:`~pyop2.Dat` with seeded random - values in the interval 0 to 1. The corresponding :class:`~pyop2.Kernel` is - constructed as follows: :: - - op2.Kernel("void setrand(double *x) { x[0] = (double)random()/RAND_MAX); }", - name="setrand", - headers=["#include "], user_code="srandom(10001);") - - .. note:: - When running in parallel with MPI the generated code must be the same - on all ranks. - """ - - _cache = {} - - @classmethod - @validate_type(('name', str, NameTypeError)) - def _cache_key(cls, code, name, opts={}, include_dirs=[], headers=[], - user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False, - flop_count=None): - # Both code and name are relevant since there might be multiple kernels - # extracting different functions from the same code - # Also include the PyOP2 version, since the Kernel class might change - - if isinstance(code, Node): - code = code.gencode() - if isinstance(code, loopy.TranslationUnit): - from loopy.tools import LoopyKeyBuilder - from hashlib import sha256 - key_hash = sha256() - code.update_persistent_hash(key_hash, LoopyKeyBuilder()) - code = key_hash.hexdigest() - hashee = (str(code) + name + str(sorted(opts.items())) + str(include_dirs) - + str(headers) + version + str(ldargs) + str(cpp) + str(requires_zeroed_output_arguments)) - return md5(hashee.encode()).hexdigest() - - @cached_property - def _wrapper_cache_key_(self): - return (self._key, ) - - def __init__(self, code, name, opts={}, include_dirs=[], headers=[], - user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False, - flop_count=None): - # Protect against re-initialization when retrieved from cache - if self._initialized: - return - self._name = name - self._cpp = cpp - # Record used optimisations - self._opts = opts - self._include_dirs = include_dirs - self._ldargs = ldargs if ldargs is not None else [] - self._headers = headers - self._user_code = user_code - assert isinstance(code, (str, Node, loopy.Program, loopy.LoopKernel, loopy.TranslationUnit)) - self._code = code - self._initialized = True - self.requires_zeroed_output_arguments = requires_zeroed_output_arguments - self.flop_count = flop_count - - @property - def name(self): - """Kernel name, must match the kernel function name in the code.""" - return self._name - - @property - def code(self): - return self._code - - @cached_property - def 
num_flops(self): - if self.flop_count is not None: - return self.flop_count - if not configuration["compute_kernel_flops"]: - return 0 - if isinstance(self.code, Node): - v = EstimateFlops() - return v.visit(self.code) - elif isinstance(self.code, loopy.TranslationUnit): - op_map = loopy.get_op_map( - self.code.copy(options=loopy.Options(ignore_boostable_into=True), - silenced_warnings=['insn_count_subgroups_upper_bound', - 'get_x_map_guessing_subgroup_size', - 'summing_if_branches_ops']), - subgroup_size='guess') - return op_map.filter_by(name=['add', 'sub', 'mul', 'div'], dtype=[ScalarType]).eval_and_sum({}) - else: - return 0 - - def __str__(self): - return "OP2 Kernel: %s" % self._name - - def __repr__(self): - return 'Kernel("""%s""", %r)' % (self._code, self._name) - - def __eq__(self, other): - return self.cache_key == other.cache_key - - -class JITModule(Cached): - - """Cached module encapsulating the generated :class:`ParLoop` stub. - - .. warning:: - - Note to implementors. This object is *cached* and therefore - should not hold any references to objects you might want to be - collected (such PyOP2 data objects).""" - - _cache = {} - - @classmethod - def _cache_key(cls, kernel, iterset, *args, **kwargs): - counter = itertools.count() - seen = defaultdict(lambda: next(counter)) - key = ((id(dup_comm(iterset.comm)), ) + kernel._wrapper_cache_key_ + iterset._wrapper_cache_key_ - + (iterset._extruded, (iterset._extruded and iterset.constant_layers), isinstance(iterset, Subset))) - - for arg in args: - key += arg._wrapper_cache_key_ - for map_ in arg.map_tuple: - key += (seen[map_],) - - key += (kwargs.get("iterate", None), cls, configuration["simd_width"]) - - return key - - -class IterationRegion(IntEnum): - BOTTOM = 1 - TOP = 2 - INTERIOR_FACETS = 3 - ALL = 4 - - -ON_BOTTOM = IterationRegion.BOTTOM -"""Iterate over the cells at the bottom of the column in an extruded mesh.""" - -ON_TOP = IterationRegion.TOP -"""Iterate over the top cells in an extruded mesh.""" - -ON_INTERIOR_FACETS = IterationRegion.INTERIOR_FACETS -"""Iterate over the interior facets of an extruded mesh.""" - -ALL = IterationRegion.ALL -"""Iterate over all cells of an extruded mesh.""" - - -class ParLoop(object): - """Represents the kernel, iteration space and arguments of a parallel loop - invocation. - - .. note :: - - Users should not directly construct :class:`ParLoop` objects, but - use :func:`pyop2.op2.par_loop` instead. - - An optional keyword argument, ``iterate``, can be used to specify - which region of an :class:`ExtrudedSet` the parallel loop should - iterate over. - """ - - @validate_type(('kernel', Kernel, KernelTypeError), - ('iterset', Set, SetTypeError)) - def __init__(self, kernel, iterset, *args, **kwargs): - # INCs into globals need to start with zero and then sum back - # into the input global at the end. This has the same number - # of reductions but means that successive par_loops - # incrementing into a global get the "right" value in - # parallel. 
- # Don't care about MIN and MAX because they commute with the reduction - self._reduced_globals = {} - for i, arg in enumerate(args): - if arg._is_global_reduction and arg.access == INC: - glob = arg.data - tmp = _make_object('Global', glob.dim, data=np.zeros_like(glob.data_ro), dtype=glob.dtype) - self._reduced_globals[tmp] = glob - args[i].data = tmp - - # Always use the current arguments, also when we hit cache - self._actual_args = args - self._kernel = kernel - self._is_layered = iterset._extruded - self._iteration_region = kwargs.get("iterate", None) - self._pass_layer_arg = kwargs.get("pass_layer_arg", False) - - check_iterset(self.args, iterset) - - if self._pass_layer_arg: - if not self._is_layered: - raise ValueError("Can't request layer arg for non-extruded iteration") - - self.iterset = iterset - self.comm = iterset.comm - - for i, arg in enumerate(self._actual_args): - arg.position = i - arg.indirect_position = i - for i, arg1 in enumerate(self._actual_args): - if arg1._is_dat and arg1._is_indirect: - for arg2 in self._actual_args[i:]: - # We have to check for identity here (we really - # want these to be the same thing, not just look - # the same) - if arg2.data is arg1.data and arg2.map is arg1.map: - arg2.indirect_position = arg1.indirect_position - - self.arglist = self.prepare_arglist(iterset, *self.args) - - def prepare_arglist(self, iterset, *args): - """Prepare the argument list for calling generated code. - - :arg iterset: The :class:`Set` iterated over. - :arg args: A list of :class:`Args`, the argument to the :fn:`par_loop`. - """ - return () - - @cached_property - def num_flops(self): - iterset = self.iterset - size = 1 - if iterset._extruded: - region = self.iteration_region - layers = np.mean(iterset.layers_array[:, 1] - iterset.layers_array[:, 0]) - if region is ON_INTERIOR_FACETS: - size = layers - 2 - elif region not in [ON_TOP, ON_BOTTOM]: - size = layers - 1 - return size * self._kernel.num_flops - - def log_flops(self, flops): - pass - - @property - @collective - def _jitmodule(self): - """Return the :class:`JITModule` that encapsulates the compiled par_loop code. - - Return None if the child class should deal with this in another way.""" - return None - - @cached_property - def _parloop_event(self): - return timed_region("ParLoopExecute") - - @collective - def compute(self): - """Executes the kernel over all members of the iteration space.""" - with self._parloop_event: - orig_lgmaps = [] - for arg in self.args: - if arg._is_mat: - new_state = {INC: Mat.ADD_VALUES, - WRITE: Mat.INSERT_VALUES}[arg.access] - for m in arg.data: - m.change_assembly_state(new_state) - arg.data.change_assembly_state(new_state) - # Boundary conditions applied to the matrix appear - # as modified lgmaps on the Arg. We set them onto - # the matrix so things are correctly dropped in - # insertion, and then restore the original lgmaps - # afterwards. - if arg.lgmaps is not None: - olgmaps = [] - for m, lgmaps in zip(arg.data, arg.lgmaps): - olgmaps.append(m.handle.getLGMap()) - m.handle.setLGMap(*lgmaps) - orig_lgmaps.append(olgmaps) - self.global_to_local_begin() - iterset = self.iterset - arglist = self.arglist - fun = self._jitmodule - # Need to ensure INC globals are zero on entry to the loop - # in case it's reused. - for g in self._reduced_globals.keys(): - g._data[...] 
= 0 - self._compute(iterset.core_part, fun, *arglist) - self.global_to_local_end() - self._compute(iterset.owned_part, fun, *arglist) - self.reduction_begin() - self.local_to_global_begin() - self.update_arg_data_state() - for arg in reversed(self.args): - if arg._is_mat and arg.lgmaps is not None: - for m, lgmaps in zip(arg.data, orig_lgmaps.pop()): - m.handle.setLGMap(*lgmaps) - self.reduction_end() - self.local_to_global_end() - - @collective - def _compute(self, part, fun, *arglist): - """Executes the kernel over all members of a MPI-part of the iteration space. - - :arg part: The :class:`SetPartition` to compute over - :arg fun: The :class:`JITModule` encapsulating the compiled - code (may be ignored by the backend). - :arg arglist: The arguments to pass to the compiled code (may - be ignored by the backend, depending on the exact implementation)""" - raise RuntimeError("Must select a backend") - - @collective - def global_to_local_begin(self): - """Start halo exchanges.""" - for arg in self.unique_dat_args: - arg.global_to_local_begin() - - @collective - def global_to_local_end(self): - """Finish halo exchanges""" - for arg in self.unique_dat_args: - arg.global_to_local_end() - - @collective - def local_to_global_begin(self): - """Start halo exchanges.""" - for arg in self.unique_dat_args: - arg.local_to_global_begin() - - @collective - def local_to_global_end(self): - """Finish halo exchanges (wait on irecvs)""" - for arg in self.unique_dat_args: - arg.local_to_global_end() - - @cached_property - def _reduction_event_begin(self): - return timed_region("ParLoopRednBegin") - - @cached_property - def _reduction_event_end(self): - return timed_region("ParLoopRednEnd") - - @cached_property - def _has_reduction(self): - return len(self.global_reduction_args) > 0 - - @collective - def reduction_begin(self): - """Start reductions""" - if not self._has_reduction: - return - with self._reduction_event_begin: - for arg in self.global_reduction_args: - arg.reduction_begin(self.comm) - - @collective - def reduction_end(self): - """End reductions""" - if not self._has_reduction: - return - with self._reduction_event_end: - for arg in self.global_reduction_args: - arg.reduction_end(self.comm) - # Finalise global increments - for tmp, glob in self._reduced_globals.items(): - glob._data += tmp._data - - @collective - def update_arg_data_state(self): - r"""Update the state of the :class:`DataCarrier`\s in the arguments to the `par_loop`. 
- - This marks :class:`Mat`\s that need assembly.""" - for arg in self.args: - access = arg.access - if access is READ: - continue - if arg._is_dat: - arg.data.halo_valid = False - if arg._is_mat: - state = {WRITE: Mat.INSERT_VALUES, - INC: Mat.ADD_VALUES}[access] - arg.data.assembly_state = state - - @cached_property - def dat_args(self): - return tuple(arg for arg in self.args if arg._is_dat) - - @cached_property - def unique_dat_args(self): - seen = {} - unique = [] - for arg in self.dat_args: - if arg.data not in seen: - unique.append(arg) - seen[arg.data] = arg - elif arg.access != seen[arg.data].access: - raise ValueError("Same Dat appears multiple times with different " - "access descriptors") - return tuple(unique) - - @cached_property - def global_reduction_args(self): - return tuple(arg for arg in self.args if arg._is_global_reduction) - - @cached_property - def kernel(self): - """Kernel executed by this parallel loop.""" - return self._kernel - - @cached_property - def args(self): - """Arguments to this parallel loop.""" - return self._actual_args - - @cached_property - def is_layered(self): - """Flag which triggers extrusion""" - return self._is_layered - - @cached_property - def iteration_region(self): - """Specifies the part of the mesh the parallel loop will - be iterating over. The effect is the loop only iterates over - a certain part of an extruded mesh, for example on top cells, bottom cells or - interior facets.""" - return self._iteration_region - - -def check_iterset(args, iterset): - """Checks that the iteration set of the :class:`ParLoop` matches the - iteration set of all its arguments. A :class:`MapValueError` is raised - if this condition is not met.""" - - if isinstance(iterset, Subset): - _iterset = iterset.superset - else: - _iterset = iterset - if configuration["type_check"]: - if isinstance(_iterset, MixedSet): - raise SetTypeError("Cannot iterate over MixedSets") - for i, arg in enumerate(args): - if arg._is_global: - continue - if arg._is_direct: - if isinstance(_iterset, ExtrudedSet): - if arg.data.dataset.set != _iterset.parent: - raise MapValueError( - "Iterset of direct arg %s doesn't match ParLoop iterset." % i) - elif arg.data.dataset.set != _iterset: - raise MapValueError( - "Iterset of direct arg %s doesn't match ParLoop iterset." % i) - continue - for j, m in enumerate(arg._map): - if isinstance(_iterset, ExtrudedSet): - if m.iterset != _iterset and m.iterset not in _iterset: - raise MapValueError( - "Iterset of arg %s map %s doesn't match ParLoop iterset." % (i, j)) - elif m.iterset != _iterset and m.iterset not in _iterset: - raise MapValueError( - "Iterset of arg %s map %s doesn't match ParLoop iterset." % (i, j)) - - -@collective -def par_loop(kernel, iterset, *args, **kwargs): - r"""Invocation of an OP2 kernel - - :arg kernel: The :class:`Kernel` to be executed. - :arg iterset: The iteration :class:`Set` over which the kernel should be - executed. - :arg \*args: One or more :class:`base.Arg`\s constructed from a - :class:`Global`, :class:`Dat` or :class:`Mat` using the call - syntax and passing in an optionally indexed :class:`Map` - through which this :class:`base.Arg` is accessed and the - :class:`base.Access` descriptor indicating how the - :class:`Kernel` is going to access this data (see the example - below). These are the global data structures from and to - which the kernel will read and write. - :kwarg iterate: Optionally specify which region of an - :class:`ExtrudedSet` to iterate over. 
- Valid values are:
-
- - ``ON_BOTTOM``: iterate over the bottom layer of cells.
- - ``ON_TOP`` iterate over the top layer of cells.
- - ``ALL`` iterate over all cells (the default if unspecified)
- - ``ON_INTERIOR_FACETS`` iterate over all the layers
- except the top layer, accessing data two adjacent (in
- the extruded direction) cells at a time.
-
- :kwarg pass_layer_arg: Should the wrapper pass the current layer
- into the kernel (as an ``int``). Only makes sense for
- indirect extruded iteration.
-
- .. warning ::
- It is the caller's responsibility that the number and type of all
- :class:`base.Arg`\s passed to the :func:`par_loop` match those expected
- by the :class:`Kernel`. No runtime check is performed to ensure this!
-
- :func:`par_loop` invocation is illustrated by the following example ::
-
- pyop2.par_loop(mass, elements,
- mat(pyop2.INC, (elem_node[pyop2.i[0]]), elem_node[pyop2.i[1]]),
- coords(pyop2.READ, elem_node))
-
- This example will execute the :class:`Kernel` ``mass`` over the
- :class:`Set` ``elements`` executing 3x3 times for each
- :class:`Set` member, assuming the :class:`Map` ``elem_node`` is of arity 3.
- The :class:`Kernel` takes four arguments, the first is a :class:`Mat` named
- ``mat``, the second is a field named ``coords``. The remaining two arguments
- indicate which local iteration space point the kernel is to execute.
-
- A :class:`Mat` requires a pair of :class:`Map` objects, one each
- for the row and column spaces. In this case both are the same
- ``elem_node`` map. The row :class:`Map` is indexed by the first
- index in the local iteration space, indicated by the ``0`` index
- to :data:`pyop2.i`, while the column space is indexed by
- the second local index. The matrix is accessed to increment
- values using the ``pyop2.INC`` access descriptor.
-
- The ``coords`` :class:`Dat` is also accessed via the ``elem_node``
- :class:`Map`, however no indices are passed so all entries of
- ``elem_node`` for the relevant member of ``elements`` will be
- passed to the kernel as a vector.
- """
- if isinstance(kernel, types.FunctionType):
- from pyop2 import pyparloop
- return pyparloop.ParLoop(kernel, iterset, *args, **kwargs).compute()
- return _make_object('ParLoop', kernel, iterset, *args, **kwargs).compute()
diff --git a/pyop2/kernel.py b/pyop2/kernel.py
new file mode 100644
index 000000000..a73bbdf73
--- /dev/null
+++ b/pyop2/kernel.py
@@ -0,0 +1,130 @@
+import hashlib
+
+import coffee
+import loopy as lp
+
+from . import caching, configuration as conf, datatypes, exceptions as ex, utils, version
+
+
+class Kernel(caching.Cached):
+
+ """OP2 kernel type.
+
+ :param code: kernel function definition, including signature; either a
+ string or an AST :class:`.Node`
+ :param name: kernel function name; must match the name of the kernel
+ function given in `code`
+ :param opts: options dictionary for PyOP2 IR optimisations
+ (optional, ignored if `code` is a string)
+ :param include_dirs: list of additional include directories to be searched
+ when compiling the kernel (optional, defaults to empty)
+ :param headers: list of system headers to include when compiling the kernel
+ in the form ``#include <header.h>`` (optional, defaults to empty)
+ :param user_code: code snippet to be executed once at the very start of
+ the generated kernel wrapper code (optional, defaults to
+ empty)
+ :param ldargs: A list of arguments to pass to the linker when
+ compiling this Kernel.
+ :param requires_zeroed_output_arguments: Does this kernel require the
+ output arguments to be zeroed on entry when called? (default no)
+ :param cpp: Is the kernel actually C++ rather than C? If yes,
+ then compile with the C++ compiler (kernel is wrapped in
+ extern C for linkage reasons).
+
+ Consider the case of initialising a :class:`~pyop2.Dat` with seeded random
+ values in the interval 0 to 1. The corresponding :class:`~pyop2.Kernel` is
+ constructed as follows: ::
+
+ op2.Kernel("void setrand(double *x) { x[0] = (double)random()/RAND_MAX; }",
+ name="setrand",
+ headers=["#include <stdlib.h>"], user_code="srandom(10001);")
+
+ .. note::
+ When running in parallel with MPI the generated code must be the same
+ on all ranks.
+ """
+
+ _cache = {}
+
+ @classmethod
+ @utils.validate_type(('name', str, ex.NameTypeError))
+ def _cache_key(cls, code, name, opts={}, include_dirs=[], headers=[],
+ user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False,
+ flop_count=None):
+ # Both code and name are relevant since there might be multiple kernels
+ # extracting different functions from the same code
+ # Also include the PyOP2 version, since the Kernel class might change
+
+ if isinstance(code, coffee.base.Node):
+ code = code.gencode()
+ if isinstance(code, lp.TranslationUnit):
+ from loopy.tools import LoopyKeyBuilder
+ # hashlib is already imported at module level
+ key_hash = hashlib.sha256()
+ code.update_persistent_hash(key_hash, LoopyKeyBuilder())
+ code = key_hash.hexdigest()
+ hashee = (str(code) + name + str(sorted(opts.items())) + str(include_dirs)
+ + str(headers) + version.__version__ + str(ldargs) + str(cpp) + str(requires_zeroed_output_arguments))
+ return hashlib.md5(hashee.encode()).hexdigest()
+
+ @utils.cached_property
+ def _wrapper_cache_key_(self):
+ return (self._key, )
+
+ def __init__(self, code, name, opts={}, include_dirs=[], headers=[],
+ user_code="", ldargs=None, cpp=False, requires_zeroed_output_arguments=False,
+ flop_count=None):
+ # Protect against re-initialization when retrieved from cache
+ if self._initialized:
+ return
+ self._name = name
+ self._cpp = cpp
+ # Record used optimisations
+ self._opts = opts
+ self._include_dirs = include_dirs
+ self._ldargs = ldargs if ldargs is not None else []
+ self._headers = headers
+ self._user_code = user_code
+ assert isinstance(code, (str, coffee.base.Node, lp.Program, lp.LoopKernel, lp.TranslationUnit))
+ self._code = code
+ self._initialized = True
+ self.requires_zeroed_output_arguments = requires_zeroed_output_arguments
+ self.flop_count = flop_count
+
+ @property
+ def name(self):
+ """Kernel name, must match the kernel function name in the code."""
+ return self._name
+
+ @property
+ def code(self):
+ return self._code
+
+ @utils.cached_property
+ def num_flops(self):
+ if self.flop_count is not None:
+ return self.flop_count
+ if not conf.configuration["compute_kernel_flops"]:
+ return 0
+ if isinstance(self.code, coffee.base.Node):
+ v = coffee.visitors.EstimateFlops()
+ return v.visit(self.code)
+ elif isinstance(self.code, lp.TranslationUnit):
+ op_map = lp.get_op_map(
+ self.code.copy(options=lp.Options(ignore_boostable_into=True),
+ silenced_warnings=['insn_count_subgroups_upper_bound',
+ 'get_x_map_guessing_subgroup_size',
+ 'summing_if_branches_ops']),
+ subgroup_size='guess')
+ return op_map.filter_by(name=['add', 'sub', 'mul', 'div'], dtype=[datatypes.ScalarType]).eval_and_sum({})
+ else:
+ return 0
+
+ def __str__(self):
+ return "OP2 Kernel: %s" % self._name
+
+ def __repr__(self):
+ return 'Kernel("""%s""", %r)' % (self._code, self._name)
+
+ def __eq__(self, other):
+ return self.cache_key == other.cache_key
+
+ def __hash__(self):
+ # Defining __eq__ would otherwise leave Kernel unhashable; hash
+ # consistently with equality on the cache key.
+ return hash(self.cache_key)
diff --git a/pyop2/parloop.py b/pyop2/parloop.py
new file mode 100644
index 000000000..462ad707c
--- /dev/null
+++ b/pyop2/parloop.py
@@ -0,0 +1,884 @@
+import collections
+import copy
+import ctypes
+import enum
+import itertools
+import os
+import types
+
+import loopy as lp
+import numpy as np
+from petsc4py import PETSc
+
+from . import (
+ caching,
+ compilation,
+ configuration as conf,
+ datatypes as dtypes,
+ exceptions as ex,
+ mpi,
+ profiling,
+ utils
+)
+from .kernel import Kernel
+from .types import (
+ Access,
+ Global, Dat, DatView, Mat, Map, MixedDat,
+ Set, MixedSet, ExtrudedSet, Subset
+)
+
+
+class Arg:
+
+ """An argument to a :func:`pyop2.op2.par_loop`.
+
+ .. warning ::
+ User code should not directly instantiate :class:`Arg`.
+ Instead, use the call syntax on the :class:`DataCarrier`.
+ """
+
+ def __init__(self, data=None, map=None, access=None, lgmaps=None, unroll_map=False):
+ """
+ :param data: A data-carrying object, either :class:`Dat` or :class:`Mat`
+ :param map: A :class:`Map` to access this :class:`Arg` or the default
+ if the identity map is to be used.
+ :param access: An access descriptor of type :class:`Access`
+ :param lgmaps: For :class:`Mat` objects, a tuple of 2-tuples of local to
+ global maps used during assembly.
+
+ Checks that:
+
+ 1. the maps used are initialized i.e. have mapping data associated, and
+ 2. the to-Set of the map used to access it matches the Set it is
+ defined on.
+
+ A :class:`MapValueError` is raised if these conditions are not met."""
+ self.data = data
+ self._map = map
+ if map is None:
+ self.map_tuple = ()
+ elif isinstance(map, Map):
+ self.map_tuple = (map, )
+ else:
+ self.map_tuple = tuple(map)
+
+ if data is not None and hasattr(data, "dtype"):
+ if data.dtype.kind == "c" and (access == Access.MIN or access == Access.MAX):
+ raise ValueError("MIN and MAX access descriptors are undefined on complex data.")
+ self._access = access
+
+ self.unroll_map = unroll_map
+ self.lgmaps = None
+ if self._is_mat and lgmaps is not None:
+ self.lgmaps = utils.as_tuple(lgmaps)
+ assert len(self.lgmaps) == self.data.nblocks
+ else:
+ if lgmaps is not None:
+ raise ValueError("Local to global maps only for matrices")
+
+ # Check arguments for consistency
+ if conf.configuration["type_check"] and not (self._is_global or map is None):
+ for j, m in enumerate(map):
+ if m.iterset.total_size > 0 and len(m.values_with_halo) == 0:
+ raise ex.MapValueError("%s is not initialized." % map)
+ if self._is_mat and m.toset != data.sparsity.dsets[j].set:
+ raise ex.MapValueError(
+ "To set of %s doesn't match the set of %s." % (map, data))
+ if self._is_dat and map.toset != data.dataset.set:
+ raise ex.MapValueError(
+ "To set of %s doesn't match the set of %s." % (map, data))
+
+ def recreate(self, data=None, map=None, access=None, lgmaps=None, unroll_map=None):
+ """Creates a new Arg based on the existing Arg with the changes specified.
+
+ :param data: A data-carrying object, either :class:`Dat` or :class:`Mat`
+ :param map: A :class:`Map` to access this :class:`Arg` or the default
+ if the identity map is to be used.
+ :param access: An access descriptor of type :class:`Access`
+ :param lgmaps: For :class:`Mat` objects, a tuple of 2-tuples of local to
+ global maps used during assembly."""
+ return type(self)(data=data or self.data,
+ map=map or self.map,
+ access=access or self.access,
+ lgmaps=lgmaps or self.lgmaps,
+ unroll_map=False if unroll_map is None else unroll_map)
+
+ @utils.cached_property
+ def _kernel_args_(self):
+ return self.data._kernel_args_
+
+ @utils.cached_property
+ def _argtypes_(self):
+ return self.data._argtypes_
+
+ @utils.cached_property
+ def _wrapper_cache_key_(self):
+ if self.map is not None:
+ map_ = tuple(None if m is None else m._wrapper_cache_key_ for m in self.map)
+ else:
+ map_ = self.map
+ return (type(self), self.access, self.data._wrapper_cache_key_, map_, self.unroll_map)
+
+ @property
+ def _key(self):
+ return (self.data, self._map, self._access)
+
+ def __eq__(self, other):
+ r""":class:`Arg`\s compare equal if they are defined on the same data,
+ use the same :class:`Map` with the same index and the same access
+ descriptor."""
+ return self._key == other._key
+
+ def __ne__(self, other):
+ r""":class:`Arg`\s compare equal if they are defined on the same data,
+ use the same :class:`Map` with the same index and the same access
+ descriptor."""
+ return not self.__eq__(other)
+
+ def __str__(self):
+ return "OP2 Arg: dat %s, map %s, access %s" % \
+ (self.data, self._map, self._access)
+
+ def __repr__(self):
+ return "Arg(%r, %r, %r)" % \
+ (self.data, self._map, self._access)
+
+ def __iter__(self):
+ for arg in self.split:
+ yield arg
+
+ @utils.cached_property
+ def split(self):
+ """Split a mixed argument into a tuple of constituent arguments."""
+ if self._is_mixed_dat:
+ return tuple(Arg(d, m, self._access)
+ for d, m in zip(self.data, self._map))
+ elif self._is_mixed_mat:
+ rows, cols = self.data.sparsity.shape
+ mr, mc = self.map
+ return tuple(Arg(self.data[i, j], (mr.split[i], mc.split[j]), self._access)
+ for i in range(rows) for j in range(cols))
+ else:
+ return (self,)
+
+ @utils.cached_property
+ def name(self):
+ """The generated argument name."""
+ return "arg%d" % self.position
+
+ @utils.cached_property
+ def ctype(self):
+ """String representing the C type of the data in this ``Arg``."""
+ return self.data.ctype
+
+ @utils.cached_property
+ def dtype(self):
+ """Numpy datatype of this Arg"""
+ return self.data.dtype
+
+ @utils.cached_property
+ def map(self):
+ """The :class:`Map` via which the data is to be accessed."""
+ return self._map
+
+ @utils.cached_property
+ def access(self):
+ """Access descriptor. One of the constants of type :class:`Access`"""
+ return self._access
+
+ @utils.cached_property
+ def _is_dat_view(self):
+ return isinstance(self.data, DatView)
+
+ @utils.cached_property
+ def _is_mat(self):
+ return isinstance(self.data, Mat)
+
+ @utils.cached_property
+ def _is_mixed_mat(self):
+ return self._is_mat and self.data.sparsity.shape > (1, 1)
+
+ @utils.cached_property
+ def _is_global(self):
+ return isinstance(self.data, Global)
+
+ @utils.cached_property
+ def _is_global_reduction(self):
+ return self._is_global and self._access in {Access.INC, Access.MIN, Access.MAX}
+
+ @utils.cached_property
+ def _is_dat(self):
+ return isinstance(self.data, Dat)
+
+ @utils.cached_property
+ def _is_mixed_dat(self):
+ return isinstance(self.data, MixedDat)
+
+ @utils.cached_property
+ def _is_mixed(self):
+ return self._is_mixed_dat or self._is_mixed_mat
+
+ @utils.cached_property
+ def _is_direct(self):
+ return isinstance(self.data, Dat) and self.map is None
+
+ @utils.cached_property
+ def _is_indirect(self):
+ return isinstance(self.data, Dat) and self.map is not None
+
+ @mpi.collective
+ def global_to_local_begin(self):
+ """Begin halo exchange for the argument if a halo update is required.
+ Doing halo exchanges only makes sense for :class:`Dat` objects.
+ """
+ assert self._is_dat, "Doing halo exchanges only makes sense for Dats"
+ if self._is_direct:
+ return
+ if self.access is not Access.WRITE:
+ self.data.global_to_local_begin(self.access)
+
+ @mpi.collective
+ def global_to_local_end(self):
+ """Finish halo exchange for the argument if a halo update is required.
+ Doing halo exchanges only makes sense for :class:`Dat` objects.
+ """
+ assert self._is_dat, "Doing halo exchanges only makes sense for Dats"
+ if self._is_direct:
+ return
+ if self.access is not Access.WRITE:
+ self.data.global_to_local_end(self.access)
+
+ @mpi.collective
+ def local_to_global_begin(self):
+ """Begin the reverse halo exchange (combining halo contributions back
+ into owned entries) if the access requires it (INC, MIN or MAX).
+ """
+ assert self._is_dat, "Doing halo exchanges only makes sense for Dats"
+ if self._is_direct:
+ return
+ if self.access in {Access.INC, Access.MIN, Access.MAX}:
+ self.data.local_to_global_begin(self.access)
+
+ @mpi.collective
+ def local_to_global_end(self):
+ """Finish the reverse halo exchange if the access requires it
+ (INC, MIN or MAX).
+ """
+ assert self._is_dat, "Doing halo exchanges only makes sense for Dats"
+ if self._is_direct:
+ return
+ if self.access in {Access.INC, Access.MIN, Access.MAX}:
+ self.data.local_to_global_end(self.access)
+
+ @mpi.collective
+ def reduction_begin(self, comm):
+ """Begin reduction for the argument if its access is INC, MIN, or MAX.
+ Doing a reduction only makes sense for :class:`Global` objects."""
+ assert self._is_global, \
+ "Doing global reduction only makes sense for Globals"
+ if self.access is not Access.READ:
+ if self.access is Access.INC:
+ op = mpi.MPI.SUM
+ elif self.access is Access.MIN:
+ op = mpi.MPI.MIN
+ elif self.access is Access.MAX:
+ op = mpi.MPI.MAX
+ if mpi.MPI.VERSION >= 3:
+ self._reduction_req = comm.Iallreduce(self.data._data, self.data._buf, op=op)
+ else:
+ comm.Allreduce(self.data._data, self.data._buf, op=op)
+
+ @mpi.collective
+ def reduction_end(self, comm):
+ """End reduction for the argument if it is in flight.
+ Doing a reduction only makes sense for :class:`Global` objects."""
+ assert self._is_global, \
+ "Doing global reduction only makes sense for Globals"
+ if self.access is not Access.READ:
+ if mpi.MPI.VERSION >= 3:
+ self._reduction_req.Wait()
+ self._reduction_req = None
+ self.data._data[:] = self.data._buf[:]
+
+
+class JITModule(caching.Cached):
+
+ """Cached module encapsulating the generated :class:`ParLoop` stub.
+
+ .. warning::
+
+ Note to implementors. This object is *cached* and therefore
+ should not hold any references to objects you might want to be
+ collected (such as PyOP2 data objects)."""
+
+ _cppargs = []
+ _libraries = []
+ _system_headers = []
+
+ _cache = {}
+
+ @classmethod
+ def _cache_key(cls, kernel, iterset, *args, **kwargs):
+ counter = itertools.count()
+ seen = collections.defaultdict(lambda: next(counter))
+ key = ((id(mpi.dup_comm(iterset.comm)), ) + kernel._wrapper_cache_key_ + iterset._wrapper_cache_key_
+ + (iterset._extruded, (iterset._extruded and iterset.constant_layers), isinstance(iterset, Subset)))
+
+ for arg in args:
+ key += arg._wrapper_cache_key_
+ for map_ in arg.map_tuple:
+ key += (seen[map_],)
+
+ key += (kwargs.get("iterate", None), cls, conf.configuration["simd_width"])
+
+ return key
+
+ def __init__(self, kernel, iterset, *args, **kwargs):
+ r"""
+ A cached compiled function to execute for a specified par_loop.
+
+ See :func:`~.par_loop` for the description of arguments.
+
+ .. warning ::
+
+ Note to implementors. This object is *cached*, and therefore
+ should not hold any long term references to objects that
+ you want to be collected. In particular, after the
+ ``args`` have been inspected to produce the compiled code,
+ they **must not** remain part of the object's slots,
+ otherwise they (and the :class:`~.Dat`\s, :class:`~.Map`\s
+ and :class:`~.Mat`\s they reference) will never be collected.
+ """
+ # Return early if we were in the cache.
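+ # An instance retrieved from the cache has already run __init__ (and
+ # compiled itself) once, setting _initialized, so skip doing so again.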
+ if self._initialized: + return + self.comm = iterset.comm + self._kernel = kernel + self._fun = None + self._iterset = iterset + self._args = args + self._iteration_region = kwargs.get('iterate', ALL) + self._pass_layer_arg = kwargs.get('pass_layer_arg', False) + # Copy the class variables, so we don't overwrite them + self._cppargs = copy.deepcopy(type(self)._cppargs) + self._libraries = copy.deepcopy(type(self)._libraries) + self._system_headers = copy.deepcopy(type(self)._system_headers) + if not kwargs.get('delay', False): + self.compile() + self._initialized = True + + @mpi.collective + def __call__(self, *args): + return self._fun(*args) + + @utils.cached_property + def _wrapper_name(self): + return 'wrap_%s' % self._kernel.name + + @utils.cached_property + def code_to_compile(self): + from pyop2.codegen.builder import WrapperBuilder + from pyop2.codegen.rep2loopy import generate + + builder = WrapperBuilder(kernel=self._kernel, + iterset=self._iterset, + iteration_region=self._iteration_region, + pass_layer_to_kernel=self._pass_layer_arg) + for arg in self._args: + builder.add_argument(arg) + + wrapper = generate(builder) + code = lp.generate_code_v2(wrapper) + + if self._kernel._cpp: + from loopy.codegen.result import process_preambles + preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) + device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) + return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" + return code.device_code() + + @PETSc.Log.EventDecorator() + @mpi.collective + def compile(self): + # If we weren't in the cache we /must/ have arguments + if not hasattr(self, '_args'): + raise RuntimeError("JITModule has no args associated with it, should never happen") + + compiler = conf.configuration["compiler"] + extension = "cpp" if self._kernel._cpp else "c" + cppargs = self._cppargs + cppargs += ["-I%s/include" % d for d in utils.get_petsc_dir()] + \ + ["-I%s" % d for d in self._kernel._include_dirs] + \ + ["-I%s" % os.path.abspath(os.path.dirname(__file__))] + ldargs = ["-L%s/lib" % d for d in utils.get_petsc_dir()] + \ + ["-Wl,-rpath,%s/lib" % d for d in utils.get_petsc_dir()] + \ + ["-lpetsc", "-lm"] + self._libraries + ldargs += self._kernel._ldargs + + self._fun = compilation.load(self, + extension, + self._wrapper_name, + cppargs=cppargs, + ldargs=ldargs, + restype=ctypes.c_int, + compiler=compiler, + comm=self.comm) + # Blow away everything we don't need any more + del self._args + del self._kernel + del self._iterset + + @utils.cached_property + def argtypes(self): + index_type = dtypes.as_ctypes(dtypes.IntType) + argtypes = (index_type, index_type) + argtypes += self._iterset._argtypes_ + for arg in self._args: + argtypes += arg._argtypes_ + seen = set() + for arg in self._args: + maps = arg.map_tuple + for map_ in maps: + for k, t in zip(map_._kernel_args_, map_._argtypes_): + if k in seen: + continue + argtypes += (t,) + seen.add(k) + return argtypes + + +class IterationRegion(enum.IntEnum): + BOTTOM = 1 + TOP = 2 + INTERIOR_FACETS = 3 + ALL = 4 + + +ON_BOTTOM = IterationRegion.BOTTOM +"""Iterate over the cells at the bottom of the column in an extruded mesh.""" + +ON_TOP = IterationRegion.TOP +"""Iterate over the top cells in an extruded mesh.""" + +ON_INTERIOR_FACETS = IterationRegion.INTERIOR_FACETS +"""Iterate over the interior facets of an extruded mesh.""" + +ALL = IterationRegion.ALL +"""Iterate over all cells of an extruded mesh.""" + + +class ParLoop: + """Represents the kernel, iteration space and 
arguments of a parallel loop + invocation. + + .. note :: + + Users should not directly construct :class:`ParLoop` objects, but + use :func:`pyop2.op2.par_loop` instead. + + An optional keyword argument, ``iterate``, can be used to specify + which region of an :class:`ExtrudedSet` the parallel loop should + iterate over. + """ + + @utils.validate_type(('kernel', Kernel, ex.KernelTypeError), + ('iterset', Set, ex.SetTypeError)) + def __init__(self, kernel, iterset, *args, **kwargs): + # INCs into globals need to start with zero and then sum back + # into the input global at the end. This has the same number + # of reductions but means that successive par_loops + # incrementing into a global get the "right" value in + # parallel. + # Don't care about MIN and MAX because they commute with the reduction + self._reduced_globals = {} + for i, arg in enumerate(args): + if arg._is_global_reduction and arg.access == Access.INC: + glob = arg.data + tmp = Global(glob.dim, data=np.zeros_like(glob.data_ro), dtype=glob.dtype) + self._reduced_globals[tmp] = glob + args[i].data = tmp + + # Always use the current arguments, also when we hit cache + self._actual_args = args + self._kernel = kernel + self._is_layered = iterset._extruded + self._iteration_region = kwargs.get("iterate", None) + self._pass_layer_arg = kwargs.get("pass_layer_arg", False) + + check_iterset(self.args, iterset) + + if self._pass_layer_arg: + if not self._is_layered: + raise ValueError("Can't request layer arg for non-extruded iteration") + + self.iterset = iterset + self.comm = iterset.comm + + for i, arg in enumerate(self._actual_args): + arg.position = i + arg.indirect_position = i + for i, arg1 in enumerate(self._actual_args): + if arg1._is_dat and arg1._is_indirect: + for arg2 in self._actual_args[i:]: + # We have to check for identity here (we really + # want these to be the same thing, not just look + # the same) + if arg2.data is arg1.data and arg2.map is arg1.map: + arg2.indirect_position = arg1.indirect_position + + self.arglist = self.prepare_arglist(iterset, *self.args) + + @utils.cached_property + def num_flops(self): + iterset = self.iterset + size = 1 + if iterset._extruded: + region = self.iteration_region + layers = np.mean(iterset.layers_array[:, 1] - iterset.layers_array[:, 0]) + if region is ON_INTERIOR_FACETS: + size = layers - 2 + elif region not in [ON_TOP, ON_BOTTOM]: + size = layers - 1 + return size * self._kernel.num_flops + + @utils.cached_property + def _parloop_event(self): + return profiling.timed_region("ParLoopExecute") + + @mpi.collective + def compute(self): + """Executes the kernel over all members of the iteration space.""" + with self._parloop_event: + orig_lgmaps = [] + for arg in self.args: + if arg._is_mat: + new_state = {Access.INC: Mat.ADD_VALUES, + Access.WRITE: Mat.INSERT_VALUES}[arg.access] + for m in arg.data: + m.change_assembly_state(new_state) + arg.data.change_assembly_state(new_state) + # Boundary conditions applied to the matrix appear + # as modified lgmaps on the Arg. We set them onto + # the matrix so things are correctly dropped in + # insertion, and then restore the original lgmaps + # afterwards. + if arg.lgmaps is not None: + olgmaps = [] + for m, lgmaps in zip(arg.data, arg.lgmaps): + olgmaps.append(m.handle.getLGMap()) + m.handle.setLGMap(*lgmaps) + orig_lgmaps.append(olgmaps) + self.global_to_local_begin() + iterset = self.iterset + arglist = self.arglist + fun = self._jitmodule + # Need to ensure INC globals are zero on entry to the loop + # in case it's reused. 
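+ # _reduced_globals maps each zeroed temporary Global to the
+ # user-visible Global it stands in for; reduction_end() sums the
+ # temporaries back into those Globals.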
+ for g in self._reduced_globals.keys(): + g._data[...] = 0 + self._compute(iterset.core_part, fun, *arglist) + self.global_to_local_end() + self._compute(iterset.owned_part, fun, *arglist) + self.reduction_begin() + self.local_to_global_begin() + self.update_arg_data_state() + for arg in reversed(self.args): + if arg._is_mat and arg.lgmaps is not None: + for m, lgmaps in zip(arg.data, orig_lgmaps.pop()): + m.handle.setLGMap(*lgmaps) + self.reduction_end() + self.local_to_global_end() + + @mpi.collective + def global_to_local_begin(self): + """Start halo exchanges.""" + for arg in self.unique_dat_args: + arg.global_to_local_begin() + + @mpi.collective + def global_to_local_end(self): + """Finish halo exchanges""" + for arg in self.unique_dat_args: + arg.global_to_local_end() + + @mpi.collective + def local_to_global_begin(self): + """Start halo exchanges.""" + for arg in self.unique_dat_args: + arg.local_to_global_begin() + + @mpi.collective + def local_to_global_end(self): + """Finish halo exchanges (wait on irecvs)""" + for arg in self.unique_dat_args: + arg.local_to_global_end() + + @utils.cached_property + def _reduction_event_begin(self): + return profiling.timed_region("ParLoopRednBegin") + + @utils.cached_property + def _reduction_event_end(self): + return profiling.timed_region("ParLoopRednEnd") + + @utils.cached_property + def _has_reduction(self): + return len(self.global_reduction_args) > 0 + + @mpi.collective + def reduction_begin(self): + """Start reductions""" + if not self._has_reduction: + return + with self._reduction_event_begin: + for arg in self.global_reduction_args: + arg.reduction_begin(self.comm) + + @mpi.collective + def reduction_end(self): + """End reductions""" + if not self._has_reduction: + return + with self._reduction_event_end: + for arg in self.global_reduction_args: + arg.reduction_end(self.comm) + # Finalise global increments + for tmp, glob in self._reduced_globals.items(): + glob._data += tmp._data + + @mpi.collective + def update_arg_data_state(self): + r"""Update the state of the :class:`DataCarrier`\s in the arguments to the `par_loop`. + + This marks :class:`Mat`\s that need assembly.""" + for arg in self.args: + access = arg.access + if access is Access.READ: + continue + if arg._is_dat: + arg.data.halo_valid = False + if arg._is_mat: + state = {Access.WRITE: Mat.INSERT_VALUES, + Access.INC: Mat.ADD_VALUES}[access] + arg.data.assembly_state = state + + @utils.cached_property + def dat_args(self): + return tuple(arg for arg in self.args if arg._is_dat) + + @utils.cached_property + def unique_dat_args(self): + seen = {} + unique = [] + for arg in self.dat_args: + if arg.data not in seen: + unique.append(arg) + seen[arg.data] = arg + elif arg.access != seen[arg.data].access: + raise ValueError("Same Dat appears multiple times with different " + "access descriptors") + return tuple(unique) + + @utils.cached_property + def global_reduction_args(self): + return tuple(arg for arg in self.args if arg._is_global_reduction) + + @utils.cached_property + def kernel(self): + """Kernel executed by this parallel loop.""" + return self._kernel + + @utils.cached_property + def args(self): + """Arguments to this parallel loop.""" + return self._actual_args + + @utils.cached_property + def is_layered(self): + """Flag which triggers extrusion""" + return self._is_layered + + @utils.cached_property + def iteration_region(self): + """Specifies the part of the mesh the parallel loop will + be iterating over. 
The effect is that the loop only iterates over
+ a certain part of an extruded mesh, for example on top cells, bottom cells or
+ interior facets."""
+ return self._iteration_region
+
+ def log_flops(self, flops):
+ PETSc.Log.logFlops(flops)
+
+ def prepare_arglist(self, iterset, *args):
+ """Prepare the argument list for calling generated code.
+
+ :arg iterset: The :class:`Set` iterated over.
+ :arg args: A list of :class:`Arg`\s, the arguments to the :func:`par_loop`.
+ """
+ arglist = iterset._kernel_args_
+ for arg in args:
+ arglist += arg._kernel_args_
+ seen = set()
+ for arg in args:
+ maps = arg.map_tuple
+ for map_ in maps:
+ if map_ is None:
+ continue
+ for k in map_._kernel_args_:
+ if k in seen:
+ continue
+ arglist += (k,)
+ seen.add(k)
+ return arglist
+
+ @utils.cached_property
+ def _jitmodule(self):
+ """The :class:`JITModule` that encapsulates the compiled par_loop code."""
+ return JITModule(self.kernel, self.iterset, *self.args,
+ iterate=self.iteration_region,
+ pass_layer_arg=self._pass_layer_arg)
+
+ @utils.cached_property
+ def _compute_event(self):
+ return profiling.timed_region("ParLoop_{0}_{1}".format(self.iterset.name, self._jitmodule._wrapper_name))
+
+ @mpi.collective
+ def _compute(self, part, fun, *arglist):
+ """Executes the kernel over all members of an MPI-part of the iteration space.
+
+ :arg part: The :class:`SetPartition` to compute over
+ :arg fun: The :class:`JITModule` encapsulating the compiled
+ code (may be ignored by the backend).
+ :arg arglist: The arguments to pass to the compiled code (may
+ be ignored by the backend, depending on the exact implementation)"""
+ with self._compute_event:
+ self.log_flops(part.size * self.num_flops)
+ fun(part.offset, part.offset + part.size, *arglist)
+
+
+def check_iterset(args, iterset):
+ """Checks that the iteration set of the :class:`ParLoop` matches the
+ iteration set of all its arguments. A :class:`MapValueError` is raised
+ if this condition is not met."""
+
+ if isinstance(iterset, Subset):
+ _iterset = iterset.superset
+ else:
+ _iterset = iterset
+ if conf.configuration["type_check"]:
+ if isinstance(_iterset, MixedSet):
+ raise ex.SetTypeError("Cannot iterate over MixedSets")
+ for i, arg in enumerate(args):
+ if arg._is_global:
+ continue
+ if arg._is_direct:
+ if isinstance(_iterset, ExtrudedSet):
+ if arg.data.dataset.set != _iterset.parent:
+ raise ex.MapValueError(
+ "Iterset of direct arg %s doesn't match ParLoop iterset." % i)
+ elif arg.data.dataset.set != _iterset:
+ raise ex.MapValueError(
+ "Iterset of direct arg %s doesn't match ParLoop iterset." % i)
+ continue
+ for j, m in enumerate(arg._map):
+ if isinstance(_iterset, ExtrudedSet):
+ if m.iterset != _iterset and m.iterset not in _iterset:
+ raise ex.MapValueError(
+ "Iterset of arg %s map %s doesn't match ParLoop iterset." % (i, j))
+ elif m.iterset != _iterset and m.iterset not in _iterset:
+ raise ex.MapValueError(
+ "Iterset of arg %s map %s doesn't match ParLoop iterset." % (i, j))
+
+
+@mpi.collective
+def par_loop(kernel, iterset, *args, **kwargs):
+ r"""Invocation of an OP2 kernel
+
+ :arg kernel: The :class:`Kernel` to be executed.
+ :arg iterset: The iteration :class:`Set` over which the kernel should be
+ executed.
+ :arg \*args: One or more :class:`base.Arg`\s constructed from a + :class:`Global`, :class:`Dat` or :class:`Mat` using the call + syntax and passing in an optionally indexed :class:`Map` + through which this :class:`base.Arg` is accessed and the + :class:`base.Access` descriptor indicating how the + :class:`Kernel` is going to access this data (see the example + below). These are the global data structures from and to + which the kernel will read and write. + :kwarg iterate: Optionally specify which region of an + :class:`ExtrudedSet` to iterate over. + Valid values are: + + - ``ON_BOTTOM``: iterate over the bottom layer of cells. + - ``ON_TOP`` iterate over the top layer of cells. + - ``ALL`` iterate over all cells (the default if unspecified) + - ``ON_INTERIOR_FACETS`` iterate over all the layers + except the top layer, accessing data two adjacent (in + the extruded direction) cells at a time. + + :kwarg pass_layer_arg: Should the wrapper pass the current layer + into the kernel (as an ``int``). Only makes sense for + indirect extruded iteration. + + .. warning :: + It is the caller's responsibility that the number and type of all + :class:`base.Arg`\s passed to the :func:`par_loop` match those expected + by the :class:`Kernel`. No runtime check is performed to ensure this! + + :func:`par_loop` invocation is illustrated by the following example :: + + pyop2.par_loop(mass, elements, + mat(pyop2.INC, (elem_node[pyop2.i[0]]), elem_node[pyop2.i[1]]), + coords(pyop2.READ, elem_node)) + + This example will execute the :class:`Kernel` ``mass`` over the + :class:`Set` ``elements`` executing 3x3 times for each + :class:`Set` member, assuming the :class:`Map` ``elem_node`` is of arity 3. + The :class:`Kernel` takes four arguments, the first is a :class:`Mat` named + ``mat``, the second is a field named ``coords``. The remaining two arguments + indicate which local iteration space point the kernel is to execute. + + A :class:`Mat` requires a pair of :class:`Map` objects, one each + for the row and column spaces. In this case both are the same + ``elem_node`` map. The row :class:`Map` is indexed by the first + index in the local iteration space, indicated by the ``0`` index + to :data:`pyop2.i`, while the column space is indexed by + the second local index. The matrix is accessed to increment + values using the ``pyop2.INC`` access descriptor. + + The ``coords`` :class:`Dat` is also accessed via the ``elem_node`` + :class:`Map`, however no indices are passed so all entries of + ``elem_node`` for the relevant member of ``elements`` will be + passed to the kernel as a vector. + """ + if isinstance(kernel, types.FunctionType): + from pyop2 import pyparloop + return pyparloop.ParLoop(kernel, iterset, *args, **kwargs).compute() + return ParLoop(kernel, iterset, *args, **kwargs).compute() + + +def generate_single_cell_wrapper(iterset, args, forward_args=(), kernel_name=None, wrapper_name=None): + """Generates wrapper for a single cell. No iteration loop, but cellwise data is extracted. + Cell is expected as an argument to the wrapper. For extruded, the numbering of the cells + is columnwise continuous, bottom to top. + + :param iterset: The iteration set + :param args: :class:`Arg`s + :param forward_args: To forward unprocessed arguments to the kernel via the wrapper, + give an iterable of strings describing their C types. 
+ :param kernel_name: Kernel function name + :param wrapper_name: Wrapper function name + + :return: string containing the C code for the single-cell wrapper + """ + from pyop2.codegen.builder import WrapperBuilder + from pyop2.codegen.rep2loopy import generate + from loopy.types import OpaqueType + + forward_arg_types = [OpaqueType(fa) for fa in forward_args] + empty_kernel = Kernel("", kernel_name) + builder = WrapperBuilder(kernel=empty_kernel, + iterset=iterset, single_cell=True, + forward_arg_types=forward_arg_types) + for arg in args: + builder.add_argument(arg) + wrapper = generate(builder, wrapper_name) + code = lp.generate_code_v2(wrapper) + + return code.device_code() diff --git a/pyop2/sequential.py b/pyop2/sequential.py deleted file mode 100644 index ff8189be0..000000000 --- a/pyop2/sequential.py +++ /dev/null @@ -1,251 +0,0 @@ -# This file is part of PyOP2 -# -# PyOP2 is Copyright (c) 2012, Imperial College London and -# others. Please see the AUTHORS file in the main source directory for -# a full list of copyright holders. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * The name of Imperial College London or that of other -# contributors may not be used to endorse or promote products -# derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS -# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -# OF THE POSSIBILITY OF SUCH DAMAGE. 
- -"""OP2 sequential backend.""" - -import os -from copy import deepcopy as dcopy - -import ctypes - -from pyop2.datatypes import IntType, as_ctypes -from pyop2 import base -from pyop2 import compilation -from pyop2 import petsc_base -from pyop2.base import par_loop # noqa: F401 -from pyop2.base import READ, WRITE, RW, INC, MIN, MAX # noqa: F401 -from pyop2.base import ALL -from pyop2.base import Map, MixedMap, PermutedMap, Sparsity, Halo # noqa: F401 -from pyop2.base import Set, ExtrudedSet, MixedSet, Subset # noqa: F401 -from pyop2.base import DatView # noqa: F401 -from pyop2.base import Kernel # noqa: F401 -from pyop2.base import Arg # noqa: F401 -from pyop2.petsc_base import DataSet, MixedDataSet # noqa: F401 -from pyop2.petsc_base import Global, GlobalDataSet # noqa: F401 -from pyop2.petsc_base import Dat, MixedDat, Mat # noqa: F401 -from pyop2.exceptions import * # noqa: F401 -from pyop2.mpi import collective -from pyop2.profiling import timed_region -from pyop2.utils import cached_property, get_petsc_dir - -from petsc4py import PETSc -import loopy - - -class JITModule(base.JITModule): - - _cppargs = [] - _libraries = [] - _system_headers = [] - - def __init__(self, kernel, iterset, *args, **kwargs): - r""" - A cached compiled function to execute for a specified par_loop. - - See :func:`~.par_loop` for the description of arguments. - - .. warning :: - - Note to implementors. This object is *cached*, and therefore - should not hold any long term references to objects that - you want to be collected. In particular, after the - ``args`` have been inspected to produce the compiled code, - they **must not** remain part of the object's slots, - otherwise they (and the :class:`~.Dat`\s, :class:`~.Map`\s - and :class:`~.Mat`\s they reference) will never be collected. - """ - # Return early if we were in the cache. 
- if self._initialized: - return - self.comm = iterset.comm - self._kernel = kernel - self._fun = None - self._iterset = iterset - self._args = args - self._iteration_region = kwargs.get('iterate', ALL) - self._pass_layer_arg = kwargs.get('pass_layer_arg', False) - # Copy the class variables, so we don't overwrite them - self._cppargs = dcopy(type(self)._cppargs) - self._libraries = dcopy(type(self)._libraries) - self._system_headers = dcopy(type(self)._system_headers) - if not kwargs.get('delay', False): - self.compile() - self._initialized = True - - @collective - def __call__(self, *args): - return self._fun(*args) - - @cached_property - def _wrapper_name(self): - return 'wrap_%s' % self._kernel.name - - @cached_property - def code_to_compile(self): - from pyop2.codegen.builder import WrapperBuilder - from pyop2.codegen.rep2loopy import generate - - builder = WrapperBuilder(kernel=self._kernel, - iterset=self._iterset, - iteration_region=self._iteration_region, - pass_layer_to_kernel=self._pass_layer_arg) - for arg in self._args: - builder.add_argument(arg) - - wrapper = generate(builder) - code = loopy.generate_code_v2(wrapper) - - if self._kernel._cpp: - from loopy.codegen.result import process_preambles - preamble = "".join(process_preambles(getattr(code, "device_preambles", []))) - device_code = "\n\n".join(str(dp.ast) for dp in code.device_programs) - return preamble + "\nextern \"C\" {\n" + device_code + "\n}\n" - return code.device_code() - - @PETSc.Log.EventDecorator() - @collective - def compile(self): - # If we weren't in the cache we /must/ have arguments - if not hasattr(self, '_args'): - raise RuntimeError("JITModule has no args associated with it, should never happen") - - from pyop2.configuration import configuration - - compiler = configuration["compiler"] - extension = "cpp" if self._kernel._cpp else "c" - cppargs = self._cppargs - cppargs += ["-I%s/include" % d for d in get_petsc_dir()] + \ - ["-I%s" % d for d in self._kernel._include_dirs] + \ - ["-I%s" % os.path.abspath(os.path.dirname(__file__))] - ldargs = ["-L%s/lib" % d for d in get_petsc_dir()] + \ - ["-Wl,-rpath,%s/lib" % d for d in get_petsc_dir()] + \ - ["-lpetsc", "-lm"] + self._libraries - ldargs += self._kernel._ldargs - - self._fun = compilation.load(self, - extension, - self._wrapper_name, - cppargs=cppargs, - ldargs=ldargs, - restype=ctypes.c_int, - compiler=compiler, - comm=self.comm) - # Blow away everything we don't need any more - del self._args - del self._kernel - del self._iterset - - @cached_property - def argtypes(self): - index_type = as_ctypes(IntType) - argtypes = (index_type, index_type) - argtypes += self._iterset._argtypes_ - for arg in self._args: - argtypes += arg._argtypes_ - seen = set() - for arg in self._args: - maps = arg.map_tuple - for map_ in maps: - for k, t in zip(map_._kernel_args_, map_._argtypes_): - if k in seen: - continue - argtypes += (t,) - seen.add(k) - return argtypes - - -class ParLoop(petsc_base.ParLoop): - - def prepare_arglist(self, iterset, *args): - arglist = iterset._kernel_args_ - for arg in args: - arglist += arg._kernel_args_ - seen = set() - for arg in args: - maps = arg.map_tuple - for map_ in maps: - if map_ is None: - continue - for k in map_._kernel_args_: - if k in seen: - continue - arglist += (k,) - seen.add(k) - return arglist - - @cached_property - def _jitmodule(self): - return JITModule(self.kernel, self.iterset, *self.args, - iterate=self.iteration_region, - pass_layer_arg=self._pass_layer_arg) - - @cached_property - def _compute_event(self): - 
return timed_region("ParLoop_{0}_{1}".format(self.iterset.name, self._jitmodule._wrapper_name)) - - @collective - def _compute(self, part, fun, *arglist): - with self._compute_event: - self.log_flops(part.size * self.num_flops) - fun(part.offset, part.offset + part.size, *arglist) - - -def generate_single_cell_wrapper(iterset, args, forward_args=(), kernel_name=None, wrapper_name=None): - """Generates wrapper for a single cell. No iteration loop, but cellwise data is extracted. - Cell is expected as an argument to the wrapper. For extruded, the numbering of the cells - is columnwise continuous, bottom to top. - - :param iterset: The iteration set - :param args: :class:`Arg`s - :param forward_args: To forward unprocessed arguments to the kernel via the wrapper, - give an iterable of strings describing their C types. - :param kernel_name: Kernel function name - :param wrapper_name: Wrapper function name - - :return: string containing the C code for the single-cell wrapper - """ - from pyop2.codegen.builder import WrapperBuilder - from pyop2.codegen.rep2loopy import generate - from loopy.types import OpaqueType - - forward_arg_types = [OpaqueType(fa) for fa in forward_args] - empty_kernel = Kernel("", kernel_name) - builder = WrapperBuilder(kernel=empty_kernel, - iterset=iterset, single_cell=True, - forward_arg_types=forward_arg_types) - for arg in args: - builder.add_argument(arg) - wrapper = generate(builder, wrapper_name) - code = loopy.generate_code_v2(wrapper) - - return code.device_code() diff --git a/pyop2/types/__init__.py b/pyop2/types/__init__.py new file mode 100644 index 000000000..e6aefdfe8 --- /dev/null +++ b/pyop2/types/__init__.py @@ -0,0 +1,9 @@ +from .access import * # noqa: F401 +from .data_carrier import * # noqa: F401 +from .dataset import * # noqa: F401 +from .dat import * # noqa: F401 +from .glob import * # noqa: F401 +from .halo import * # noqa: F401 +from .map import * # noqa: F401 +from .mat import * # noqa: F401 +from .set import * # noqa: F401 diff --git a/pyop2/types/access.py b/pyop2/types/access.py new file mode 100644 index 000000000..c3e2fe003 --- /dev/null +++ b/pyop2/types/access.py @@ -0,0 +1,37 @@ +import enum + + +class Access(enum.IntEnum): + READ = 1 + WRITE = 2 + RW = 3 + INC = 4 + MIN = 5 + MAX = 6 + + +READ = Access.READ +"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed read-only.""" + +WRITE = Access.WRITE +"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed write-only, +and OP2 is not required to handle write conflicts.""" + +RW = Access.RW +"""The :class:`Global`, :class:`Dat`, or :class:`Mat` is accessed for reading +and writing, and OP2 is not required to handle write conflicts.""" + +INC = Access.INC +"""The kernel computes increments to be summed onto a :class:`Global`, +:class:`Dat`, or :class:`Mat`. OP2 is responsible for managing the write +conflicts caused.""" + +MIN = Access.MIN +"""The kernel contributes to a reduction into a :class:`Global` using a ``min`` +operation. OP2 is responsible for reducing over the different kernel +invocations.""" + +MAX = Access.MAX +"""The kernel contributes to a reduction into a :class:`Global` using a ``max`` +operation. 
OP2 is responsible for reducing over the different kernel +invocations.""" diff --git a/pyop2/types/dat.py b/pyop2/types/dat.py new file mode 100644 index 000000000..b238f8ae1 --- /dev/null +++ b/pyop2/types/dat.py @@ -0,0 +1,1023 @@ +import abc +import contextlib +import ctypes +import itertools +import operator + +import loopy as lp +import numpy as np +from petsc4py import PETSc + +from pyop2 import ( + configuration as conf, + datatypes as dtypes, + exceptions as ex, + mpi, + utils +) +from pyop2.types.access import Access +from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin +from pyop2.types.set import ExtrudedSet, GlobalSet, Set + + +class AbstractDat(DataCarrier, EmptyDataMixin, abc.ABC): + """OP2 vector data. A :class:`Dat` holds values on every element of a + :class:`DataSet`. + + If a :class:`Set` is passed as the ``dataset`` argument, rather + than a :class:`DataSet`, the :class:`Dat` is created with a default + :class:`DataSet` dimension of 1. + + If a :class:`Dat` is passed as the ``dataset`` argument, a copy is + returned. + + It is permissible to pass `None` as the `data` argument. In this + case, allocation of the data buffer is postponed until it is + accessed. + + .. note:: + If the data buffer is not passed in, it is implicitly + initialised to be zero. + + When a :class:`Dat` is passed to :func:`pyop2.op2.par_loop`, the map via + which indirection occurs and the access descriptor are passed by + calling the :class:`Dat`. For instance, if a :class:`Dat` named ``D`` is + to be accessed for reading via a :class:`Map` named ``M``, this is + accomplished by :: + + D(pyop2.READ, M) + + The :class:`Map` through which indirection occurs can be indexed + using the index notation described in the documentation for the + :class:`Map`. Direct access to a Dat is accomplished by + omitting the path argument. + + :class:`Dat` objects support the pointwise linear algebra operations + ``+=``, ``*=``, ``-=``, ``/=``, where ``*=`` and ``/=`` also support + multiplication / division by a scalar. + """ + + _zero_kernels = {} + """Class-level cache for zero kernels.""" + + _modes = [Access.READ, Access.WRITE, Access.RW, Access.INC, Access.MIN, Access.MAX] + + @utils.cached_property + def pack(self): + from pyop2.codegen.builder import DatPack + return DatPack + + @utils.validate_type(('dataset', (DataCarrier, DataSet, Set), ex.DataSetTypeError), + ('name', str, ex.NameTypeError)) + @utils.validate_dtype(('dtype', None, ex.DataTypeError)) + def __init__(self, dataset, data=None, dtype=None, name=None): + + if isinstance(dataset, Dat): + self.__init__(dataset.dataset, None, dtype=dataset.dtype, + name="copy_of_%s" % dataset.name) + dataset.copy(self) + return + if type(dataset) is Set or type(dataset) is ExtrudedSet: + # If a Set, rather than a dataset is passed in, default to + # a dataset dimension of 1. 
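+ # Set.__pow__ builds the corresponding DataSet, so "dataset ** 1"
+ # attaches a dimension-1 DataSet to the Set.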
+ dataset = dataset ** 1
+ self._shape = (dataset.total_size,) + (() if dataset.cdim == 1 else dataset.dim)
+ EmptyDataMixin.__init__(self, data, dtype, self._shape)
+
+ self._dataset = dataset
+ self.comm = dataset.comm
+ self.halo_valid = True
+ self._name = name or "dat_#x%x" % id(self)
+
+ @utils.cached_property
+ def _kernel_args_(self):
+ return (self._data.ctypes.data, )
+
+ @utils.cached_property
+ def _argtypes_(self):
+ return (ctypes.c_voidp, )
+
+ @utils.cached_property
+ def _wrapper_cache_key_(self):
+ return (type(self), self.dtype, self._dataset._wrapper_cache_key_)
+
+ @utils.validate_in(('access', _modes, ex.ModeValueError))
+ def __call__(self, access, path=None):
+ from pyop2.parloop import Arg
+ if conf.configuration["type_check"] and path and path.toset != self.dataset.set:
+ raise ex.MapValueError("To Set of Map does not match Set of Dat.")
+ return Arg(data=self, map=path, access=access)
+
+ def __getitem__(self, idx):
+ """Return self if ``idx`` is 0, raise an error otherwise."""
+ if idx != 0:
+ raise ex.IndexValueError("Can only extract component 0 from %r" % self)
+ return self
+
+ @utils.cached_property
+ def split(self):
+ """Tuple containing only this :class:`Dat`."""
+ return (self,)
+
+ @utils.cached_property
+ def dataset(self):
+ """:class:`DataSet` on which the Dat is defined."""
+ return self._dataset
+
+ @utils.cached_property
+ def dim(self):
+ """The shape of the values for each element of the object."""
+ return self.dataset.dim
+
+ @utils.cached_property
+ def cdim(self):
+ """The scalar number of values for each member of the object. This is
+ the product of the dim tuple."""
+ return self.dataset.cdim
+
+ @property
+ @mpi.collective
+ def data(self):
+ """Numpy array containing the data values.
+
+ With this accessor you are claiming that you will modify
+ the values you get back. If you only need to look at the
+ values, use :meth:`data_ro` instead.
+
+ This only shows local values, to see the halo values too use
+ :meth:`data_with_halos`.
+
+ """
+ if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0:
+ raise RuntimeError("Illegal access: no data associated with this Dat!")
+ self.halo_valid = False
+ v = self._data[:self.dataset.size].view()
+ v.setflags(write=True)
+ return v
+
+ @property
+ @mpi.collective
+ def data_with_halos(self):
+ r"""A view of this :class:`Dat`\s data.
+
+ This accessor marks the :class:`Dat` as dirty, see
+ :meth:`data` for more details on the semantics.
+
+ With this accessor, you get to see up to date halo values, but
+ you should not try to modify them, because they will be
+ overwritten by the next halo exchange."""
+ self.global_to_local_begin(Access.RW)
+ self.global_to_local_end(Access.RW)
+ self.halo_valid = False
+ v = self._data.view()
+ v.setflags(write=True)
+ return v
+
+ @property
+ @mpi.collective
+ def data_ro(self):
+ """Numpy array containing the data values. Read-only.
+
+ With this accessor you are not allowed to modify the values
+ you get back. If you need to do so, use :meth:`data` instead.
+
+ This only shows local values, to see the halo values too use
+ :meth:`data_ro_with_halos`.
+
+ """
+ if self.dataset.total_size > 0 and self._data.size == 0 and self.cdim > 0:
+ raise RuntimeError("Illegal access: no data associated with this Dat!")
+ v = self._data[:self.dataset.size].view()
+ v.setflags(write=False)
+ return v
+
+ @property
+ @mpi.collective
+ def data_ro_with_halos(self):
+ r"""A view of this :class:`Dat`\s data.
+
+ This accessor does not mark the :class:`Dat` as dirty, and is
+ a read-only view, see :meth:`data_ro` for more details on the
+ semantics.
+
+ With this accessor, you get to see up to date halo values, but
+ you should not try to modify them, because they will be
+ overwritten by the next halo exchange.
+
+ """
+ self.global_to_local_begin(Access.READ)
+ self.global_to_local_end(Access.READ)
+ v = self._data.view()
+ v.setflags(write=False)
+ return v
+
+ def save(self, filename):
+ """Write the data array to file ``filename`` in NumPy format."""
+ np.save(filename, self.data_ro)
+
+ def load(self, filename):
+ """Read the data stored in file ``filename`` into a NumPy array
+ and store the values in ``_data``.
+ """
+ # The np.save method appends a .npy extension to the file name
+ # if the user has not supplied it. However, np.load does not,
+ # so we need to handle this ourselves here.
+ if not filename.endswith(".npy"):
+ filename = filename + ".npy"
+
+ if isinstance(self.data, tuple):
+ # MixedDat case
+ for d, d_from_file in zip(self.data, np.load(filename)):
+ d[:] = d_from_file[:]
+ else:
+ self.data[:] = np.load(filename)
+
+ @utils.cached_property
+ def shape(self):
+ return self._shape
+
+ @utils.cached_property
+ def dtype(self):
+ return self._dtype
+
+ @utils.cached_property
+ def nbytes(self):
+ """Return an estimate of the size of the data associated with this
+ :class:`Dat` in bytes. This will be the correct size of the data
+ payload, but does not take into account the (presumably small)
+ overhead of the object and its metadata.
+
+ Note that this is the process-local memory usage, not the sum
+ over all MPI processes.
+ """
+
+ return self.dtype.itemsize * self.dataset.total_size * self.dataset.cdim
+
+ @mpi.collective
+ def zero(self, subset=None):
+ """Zero the data associated with this :class:`Dat`
+
+ :arg subset: A :class:`Subset` of entries to zero (optional)."""
+ # If there is no subset we can safely zero the halo values.
+ if subset is None:
+ self._data[:] = 0
+ self.halo_valid = True
+ elif subset.superset != self.dataset.set:
+ raise ex.MapValueError("The subset and dataset are incompatible")
+ else:
+ self.data[subset.owned_indices] = 0
+
+ @mpi.collective
+ def copy(self, other, subset=None):
+ """Copy the data in this :class:`Dat` into another.
+
+ :arg other: The destination :class:`Dat`
+ :arg subset: A :class:`Subset` of elements to copy (optional)"""
+ if other is self:
+ return
+ if subset is None:
+ # If the current halo is valid we can also copy these values across.
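+ # Writing through _data rather than the public accessors copies the
+ # halo entries too and does not trigger a halo exchange; that is
+ # safe here because our own halo is up to date.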
+            if self.halo_valid:
+                other._data[:] = self._data
+                other.halo_valid = True
+            else:
+                other.data[:] = self.data_ro
+        elif subset.superset != self.dataset.set:
+            raise ex.MapValueError("The subset and dataset are incompatible")
+        else:
+            other.data[subset.owned_indices] = self.data_ro[subset.owned_indices]
+
+    def __iter__(self):
+        """Yield self when iterated over."""
+        yield self
+
+    def __len__(self):
+        """This is not a mixed type and therefore of length 1."""
+        return 1
+
+    def __str__(self):
+        return "OP2 Dat: %s on (%s) with datatype %s" \
+            % (self._name, self._dataset, self.dtype.name)
+
+    def __repr__(self):
+        return "Dat(%r, None, %r, %r)" \
+            % (self._dataset, self.dtype, self._name)
+
+    def _check_shape(self, other):
+        if other.dataset.dim != self.dataset.dim:
+            raise ValueError('Mismatched shapes in operands %s and %s' %
+                             (self.dataset.dim, other.dataset.dim))
+
+    def _op_kernel(self, op, globalp, dtype):
+        from pyop2.kernel import Kernel
+        key = (op, globalp, dtype)
+        try:
+            if not hasattr(self, "_op_kernel_cache"):
+                self._op_kernel_cache = {}
+            return self._op_kernel_cache[key]
+        except KeyError:
+            pass
+        import islpy as isl
+        import pymbolic.primitives as p
+        name = "binop_%s" % op.__name__
+        inames = isl.make_zero_and_vars(["i"])
+        domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim))
+        _other = p.Variable("other")
+        _self = p.Variable("self")
+        _ret = p.Variable("ret")
+        i = p.Variable("i")
+        lhs = _ret.index(i)
+        if globalp:
+            rhs = _other.index(0)
+            rshape = (1, )
+        else:
+            rhs = _other.index(i)
+            rshape = (self.cdim, )
+        insn = lp.Assignment(lhs, op(_self.index(i), rhs), within_inames=frozenset(["i"]))
+        data = [lp.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)),
+                lp.GlobalArg("other", dtype=dtype, shape=rshape),
+                lp.GlobalArg("ret", dtype=self.dtype, shape=(self.cdim,))]
+        knl = lp.make_function([domain], [insn], data, name=name, target=lp.CTarget(), lang_version=(2018, 2))
+        return self._op_kernel_cache.setdefault(key, Kernel(knl, name))
+
+    def _op(self, other, op):
+        from pyop2.parloop import par_loop
+        from pyop2.types.glob import Global
+        ret = Dat(self.dataset, None, self.dtype)
+        if np.isscalar(other):
+            other = Global(1, data=other)
+            globalp = True
+        else:
+            self._check_shape(other)
+            globalp = False
+        par_loop(self._op_kernel(op, globalp, other.dtype),
+                 self.dataset.set, self(Access.READ), other(Access.READ), ret(Access.WRITE))
+        return ret
+
+    def _iop_kernel(self, op, globalp, other_is_self, dtype):
+        key = (op, globalp, other_is_self, dtype)
+        try:
+            if not hasattr(self, "_iop_kernel_cache"):
+                self._iop_kernel_cache = {}
+            return self._iop_kernel_cache[key]
+        except KeyError:
+            pass
+        import islpy as isl
+        import pymbolic.primitives as p
+        from pyop2.kernel import Kernel
+        name = "iop_%s" % op.__name__
+        inames = isl.make_zero_and_vars(["i"])
+        domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim))
+        _other = p.Variable("other")
+        _self = p.Variable("self")
+        i = p.Variable("i")
+        lhs = _self.index(i)
+        rshape = (self.cdim, )
+        if globalp:
+            rhs = _other.index(0)
+            rshape = (1, )
+        elif other_is_self:
+            rhs = _self.index(i)
+        else:
+            rhs = _other.index(i)
+        insn = lp.Assignment(lhs, op(lhs, rhs), within_inames=frozenset(["i"]))
+        data = [lp.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,))]
+        if not other_is_self:
+            data.append(lp.GlobalArg("other", dtype=dtype, shape=rshape))
+        knl = lp.make_function([domain], [insn], data, name=name, target=lp.CTarget(),
lang_version=(2018, 2)) + return self._iop_kernel_cache.setdefault(key, Kernel(knl, name)) + + def _iop(self, other, op): + from pyop2.parloop import par_loop + from pyop2.types.glob import Global + globalp = False + if np.isscalar(other): + other = Global(1, data=other) + globalp = True + elif other is not self: + self._check_shape(other) + args = [self(Access.INC)] + if other is not self: + args.append(other(Access.READ)) + par_loop(self._iop_kernel(op, globalp, other is self, other.dtype), self.dataset.set, *args) + return self + + def _inner_kernel(self, dtype): + try: + if not hasattr(self, "_inner_kernel_cache"): + self._inner_kernel_cache = {} + return self._inner_kernel_cache[dtype] + except KeyError: + pass + import islpy as isl + import pymbolic.primitives as p + from pyop2.kernel import Kernel + inames = isl.make_zero_and_vars(["i"]) + domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) + _self = p.Variable("self") + _other = p.Variable("other") + _ret = p.Variable("ret") + _conj = p.Variable("conj") if dtype.kind == "c" else lambda x: x + i = p.Variable("i") + insn = lp.Assignment(_ret[0], _ret[0] + _self[i]*_conj(_other[i]), + within_inames=frozenset(["i"])) + data = [lp.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)), + lp.GlobalArg("other", dtype=dtype, shape=(self.cdim,)), + lp.GlobalArg("ret", dtype=self.dtype, shape=(1,))] + knl = lp.make_function([domain], [insn], data, name="inner", target=lp.CTarget(), lang_version=(2018, 2)) + k = Kernel(knl, "inner") + return self._inner_kernel_cache.setdefault(dtype, k) + + def inner(self, other): + """Compute the l2 inner product of the flattened :class:`Dat` + + :arg other: the other :class:`Dat` to compute the inner + product against. The complex conjugate of this is taken. + + """ + from pyop2.parloop import par_loop + from pyop2.types.glob import Global + self._check_shape(other) + ret = Global(1, data=0, dtype=self.dtype) + par_loop(self._inner_kernel(other.dtype), self.dataset.set, + self(Access.READ), other(Access.READ), ret(Access.INC)) + return ret.data_ro[0] + + @property + def norm(self): + """Compute the l2 norm of this :class:`Dat` + + .. note:: + + This acts on the flattened data (see also :meth:`inner`).""" + from math import sqrt + return sqrt(self.inner(self).real) + + def __pos__(self): + pos = Dat(self) + return pos + + def __add__(self, other): + """Pointwise addition of fields.""" + return self._op(other, operator.add) + + def __radd__(self, other): + """Pointwise addition of fields. + + self.__radd__(other) <==> other + self.""" + return self + other + + @utils.cached_property + def _neg_kernel(self): + # Copy and negate in one go. 
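+        # The loopy kernel built below is equivalent to
+        #     for i in range(cdim): other[i] = -self[i]
+        # so the result Dat is filled directly, without an intermediate copy.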
+ import islpy as isl + import pymbolic.primitives as p + from pyop2.kernel import Kernel + name = "neg" + inames = isl.make_zero_and_vars(["i"]) + domain = (inames[0].le_set(inames["i"])) & (inames["i"].lt_set(inames[0] + self.cdim)) + lvalue = p.Variable("other") + rvalue = p.Variable("self") + i = p.Variable("i") + insn = lp.Assignment(lvalue.index(i), -rvalue.index(i), within_inames=frozenset(["i"])) + data = [lp.GlobalArg("other", dtype=self.dtype, shape=(self.cdim,)), + lp.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,))] + knl = lp.make_function([domain], [insn], data, name=name, target=lp.CTarget(), lang_version=(2018, 2)) + return Kernel(knl, name) + + def __neg__(self): + from pyop2.parloop import par_loop + neg = Dat(self.dataset, dtype=self.dtype) + par_loop(self._neg_kernel, self.dataset.set, neg(Access.WRITE), self(Access.READ)) + return neg + + def __sub__(self, other): + """Pointwise subtraction of fields.""" + return self._op(other, operator.sub) + + def __rsub__(self, other): + """Pointwise subtraction of fields. + + self.__rsub__(other) <==> other - self.""" + ret = -self + ret += other + return ret + + def __mul__(self, other): + """Pointwise multiplication or scaling of fields.""" + return self._op(other, operator.mul) + + def __rmul__(self, other): + """Pointwise multiplication or scaling of fields. + + self.__rmul__(other) <==> other * self.""" + return self.__mul__(other) + + def __truediv__(self, other): + """Pointwise division or scaling of fields.""" + return self._op(other, operator.truediv) + + __div__ = __truediv__ # Python 2 compatibility + + def __iadd__(self, other): + """Pointwise addition of fields.""" + return self._iop(other, operator.iadd) + + def __isub__(self, other): + """Pointwise subtraction of fields.""" + return self._iop(other, operator.isub) + + def __imul__(self, other): + """Pointwise multiplication or scaling of fields.""" + return self._iop(other, operator.imul) + + def __itruediv__(self, other): + """Pointwise division or scaling of fields.""" + return self._iop(other, operator.itruediv) + + @mpi.collective + def global_to_local_begin(self, access_mode): + """Begin a halo exchange from global to ghosted representation. + + :kwarg access_mode: Mode with which the data will subsequently + be accessed.""" + halo = self.dataset.halo + if halo is None: + return + if not self.halo_valid and access_mode in {Access.READ, Access.RW}: + halo.global_to_local_begin(self, Access.WRITE) + elif access_mode in {Access.INC, Access.MIN, Access.MAX}: + min_, max_ = dtypes.dtype_limits(self.dtype) + val = {Access.MAX: min_, Access.MIN: max_, Access.INC: 0}[access_mode] + self._data[self.dataset.size:] = val + else: + # WRITE + pass + + @mpi.collective + def global_to_local_end(self, access_mode): + """End a halo exchange from global to ghosted representation. + + :kwarg access_mode: Mode with which the data will subsequently + be accessed.""" + halo = self.dataset.halo + if halo is None: + return + if not self.halo_valid and access_mode in {Access.READ, Access.RW}: + halo.global_to_local_end(self, Access.WRITE) + self.halo_valid = True + elif access_mode in {Access.INC, Access.MIN, Access.MAX}: + self.halo_valid = False + else: + # WRITE + pass + + @mpi.collective + def local_to_global_begin(self, insert_mode): + """Begin a halo exchange from ghosted to global representation. 
+ + :kwarg insert_mode: insertion mode (an access descriptor)""" + halo = self.dataset.halo + if halo is None: + return + halo.local_to_global_begin(self, insert_mode) + + @mpi.collective + def local_to_global_end(self, insert_mode): + """End a halo exchange from ghosted to global representation. + + :kwarg insert_mode: insertion mode (an access descriptor)""" + halo = self.dataset.halo + if halo is None: + return + halo.local_to_global_end(self, insert_mode) + self.halo_valid = False + + +class DatView(AbstractDat): + """An indexed view into a :class:`Dat`. + + This object can be used like a :class:`Dat` but the kernel will + only see the requested index, rather than the full data. + + :arg dat: The :class:`Dat` to create a view into. + :arg index: The component to select a view of. + """ + def __init__(self, dat, index): + index = utils.as_tuple(index) + assert len(index) == len(dat.dim) + for i, d in zip(index, dat.dim): + if not (0 <= i < d): + raise ex.IndexValueError("Can't create DatView with index %s for Dat with shape %s" % (index, dat.dim)) + self.index = index + # Point at underlying data + super(DatView, self).__init__(dat.dataset, + dat._data, + dtype=dat.dtype, + name="view[%s](%s)" % (index, dat.name)) + self._parent = dat + + @utils.cached_property + def _kernel_args_(self): + return self._parent._kernel_args_ + + @utils.cached_property + def _argtypes_(self): + return self._parent._argtypes_ + + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self), self.index, self._parent._wrapper_cache_key_) + + @utils.cached_property + def cdim(self): + return 1 + + @utils.cached_property + def dim(self): + return (1, ) + + @utils.cached_property + def shape(self): + return (self.dataset.total_size, ) + + @property + def data(self): + full = self._parent.data + idx = (slice(None), *self.index) + return full[idx] + + @property + def data_ro(self): + full = self._parent.data_ro + idx = (slice(None), *self.index) + return full[idx] + + @property + def data_with_halos(self): + full = self._parent.data_with_halos + idx = (slice(None), *self.index) + return full[idx] + + @property + def data_ro_with_halos(self): + full = self._parent.data_ro_with_halos + idx = (slice(None), *self.index) + return full[idx] + + +class Dat(AbstractDat, VecAccessMixin): + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. + size = self.dataset.layout_vec.getSizes() + data = self._data[:size[0]] + return PETSc.Vec().createWithArray(data, size=size, bsize=self.cdim, comm=self.comm) + + @contextlib.contextmanager + def vec_context(self, access): + r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`. + + :param access: Access descriptor: READ, WRITE, or RW.""" + # PETSc Vecs have a state counter and cache norm computations + # to return immediately if the state counter is unchanged. + # Since we've updated the data behind their back, we need to + # change that state counter. + self._vec.stateIncrease() + yield self._vec + if access is not Access.READ: + self.halo_valid = False + + +class MixedDat(AbstractDat, VecAccessMixin): + r"""A container for a bag of :class:`Dat`\s. 
+
+    Initialized either from a :class:`MixedDataSet`, a :class:`MixedSet`, or
+    an iterable of :class:`DataSet`\s and/or :class:`Set`\s, where all the
+    :class:`Set`\s are implicitly upcast to :class:`DataSet`\s ::
+
+        mdat = op2.MixedDat(mdset)
+        mdat = op2.MixedDat([dset1, ..., dsetN])
+
+    or from an iterable of :class:`Dat`\s ::
+
+        mdat = op2.MixedDat([dat1, ..., datN])
+    """
+
+    def __init__(self, mdset_or_dats):
+        from pyop2.types.glob import Global
+
+        def what(x):
+            if isinstance(x, (Global, GlobalDataSet, GlobalSet)):
+                return Global
+            elif isinstance(x, (Dat, DataSet, Set)):
+                return Dat
+            else:
+                raise ex.DataSetTypeError("Cannot create a MixedDat from %r" % (x,))
+
+        if isinstance(mdset_or_dats, MixedDat):
+            self._dats = tuple(what(d)(d) for d in mdset_or_dats)
+        else:
+            self._dats = tuple(d if isinstance(d, (Dat, Global)) else what(d)(d) for d in mdset_or_dats)
+        if not all(d.dtype == self._dats[0].dtype for d in self._dats):
+            raise ex.DataValueError('MixedDat with different dtypes is not supported')
+        # TODO: Think about different communicators on dats (c.f. MixedSet)
+        self.comm = self._dats[0].comm
+
+    @utils.cached_property
+    def _kernel_args_(self):
+        return tuple(itertools.chain(*(d._kernel_args_ for d in self)))
+
+    @utils.cached_property
+    def _argtypes_(self):
+        return tuple(itertools.chain(*(d._argtypes_ for d in self)))
+
+    @utils.cached_property
+    def _wrapper_cache_key_(self):
+        return (type(self),) + tuple(d._wrapper_cache_key_ for d in self)
+
+    def __getitem__(self, idx):
+        """Return :class:`Dat` with index ``idx`` or a given slice of Dats."""
+        return self._dats[idx]
+
+    @utils.cached_property
+    def dtype(self):
+        """The NumPy dtype of the data."""
+        return self._dats[0].dtype
+
+    @utils.cached_property
+    def split(self):
+        r"""The underlying tuple of :class:`Dat`\s."""
+        return self._dats
+
+    @utils.cached_property
+    def dataset(self):
+        r""":class:`MixedDataSet`\s this :class:`MixedDat` is defined on."""
+        return MixedDataSet(tuple(s.dataset for s in self._dats))
+
+    @utils.cached_property
+    def _data(self):
+        """Return the user-provided data buffer, or a zeroed buffer of
+        the correct size if none was provided."""
+        return tuple(d._data for d in self)
+
+    @property
+    @mpi.collective
+    def data(self):
+        """Numpy arrays containing the data excluding halos."""
+        return tuple(s.data for s in self._dats)
+
+    @property
+    @mpi.collective
+    def data_with_halos(self):
+        """Numpy arrays containing the data including halos."""
+        return tuple(s.data_with_halos for s in self._dats)
+
+    @property
+    @mpi.collective
+    def data_ro(self):
+        """Numpy arrays with read-only data excluding halos."""
+        return tuple(s.data_ro for s in self._dats)
+
+    @property
+    @mpi.collective
+    def data_ro_with_halos(self):
+        """Numpy arrays with read-only data including halos."""
+        return tuple(s.data_ro_with_halos for s in self._dats)
+
+    @property
+    def halo_valid(self):
+        """Does this Dat have up to date halos?"""
+        return all(s.halo_valid for s in self)
+
+    @halo_valid.setter
+    def halo_valid(self, val):
+        """Indicate whether this Dat requires a halo update"""
+        for d in self:
+            d.halo_valid = val
+
+    @mpi.collective
+    def global_to_local_begin(self, access_mode):
+        for s in self:
+            s.global_to_local_begin(access_mode)
+
+    @mpi.collective
+    def global_to_local_end(self, access_mode):
+        for s in self:
+            s.global_to_local_end(access_mode)
+
+    @mpi.collective
+    def local_to_global_begin(self, insert_mode):
+        for s in self:
+            s.local_to_global_begin(insert_mode)
+
+    @mpi.collective
+    def local_to_global_end(self, insert_mode):
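+        # Forward to each constituent Dat in turn.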
+        for s in self:
+            s.local_to_global_end(insert_mode)
+
+    @mpi.collective
+    def zero(self, subset=None):
+        """Zero the data associated with this :class:`MixedDat`.
+
+        :arg subset: optional subset of entries to zero (not implemented)."""
+        if subset is not None:
+            raise NotImplementedError("Subsets of mixed sets not implemented")
+        for d in self._dats:
+            d.zero()
+
+    @utils.cached_property
+    def nbytes(self):
+        """Return an estimate of the size of the data associated with this
+        :class:`MixedDat` in bytes. This will be the correct size of the data
+        payload, but does not take into account the (presumably small)
+        overhead of the object and its metadata.
+
+        Note that this is the process local memory usage, not the sum
+        over all MPI processes.
+        """
+
+        return np.sum([d.nbytes for d in self._dats])
+
+    @mpi.collective
+    def copy(self, other, subset=None):
+        """Copy the data in this :class:`MixedDat` into another.
+
+        :arg other: The destination :class:`MixedDat`
+        :arg subset: Subsets are not supported, this must be ``None``"""
+
+        if subset is not None:
+            raise NotImplementedError("MixedDat.copy with a Subset is not supported")
+        for s, o in zip(self, other):
+            s.copy(o)
+
+    def __iter__(self):
+        r"""Yield all :class:`Dat`\s when iterated over."""
+        for d in self._dats:
+            yield d
+
+    def __len__(self):
+        r"""Return number of contained :class:`Dat`\s."""
+        return len(self._dats)
+
+    def __hash__(self):
+        return hash(self._dats)
+
+    def __eq__(self, other):
+        r""":class:`MixedDat`\s are equal if all their contained :class:`Dat`\s
+        are."""
+        return type(self) == type(other) and self._dats == other._dats
+
+    def __ne__(self, other):
+        r""":class:`MixedDat`\s are unequal if any of their contained :class:`Dat`\s
+        differ."""
+        return not self.__eq__(other)
+
+    def __str__(self):
+        return "OP2 MixedDat composed of Dats: %s" % (self._dats,)
+
+    def __repr__(self):
+        return "MixedDat(%r)" % (self._dats,)
+
+    def inner(self, other):
+        """Compute the l2 inner product.
+
+        :arg other: the other :class:`MixedDat` to compute the inner product against"""
+        ret = 0
+        for s, o in zip(self, other):
+            ret += s.inner(o)
+        return ret
+
+    def _op(self, other, op):
+        ret = []
+        if np.isscalar(other):
+            for s in self:
+                ret.append(op(s, other))
+        else:
+            self._check_shape(other)
+            for s, o in zip(self, other):
+                ret.append(op(s, o))
+        return MixedDat(ret)
+
+    def _iop(self, other, op):
+        if np.isscalar(other):
+            for s in self:
+                op(s, other)
+        else:
+            self._check_shape(other)
+            for s, o in zip(self, other):
+                op(s, o)
+        return self
+
+    def __pos__(self):
+        ret = []
+        for s in self:
+            ret.append(s.__pos__())
+        return MixedDat(ret)
+
+    def __neg__(self):
+        ret = []
+        for s in self:
+            ret.append(s.__neg__())
+        return MixedDat(ret)
+
+    def __add__(self, other):
+        """Pointwise addition of fields."""
+        return self._op(other, operator.add)
+
+    def __radd__(self, other):
+        """Pointwise addition of fields.
+
+        self.__radd__(other) <==> other + self."""
+        return self._op(other, operator.add)
+
+    def __sub__(self, other):
+        """Pointwise subtraction of fields."""
+        return self._op(other, operator.sub)
+
+    def __rsub__(self, other):
+        """Pointwise subtraction of fields.
+
+        self.__rsub__(other) <==> other - self."""
+        ret = -self
+        ret += other
+        return ret
+
+    def __mul__(self, other):
+        """Pointwise multiplication or scaling of fields."""
+        return self._op(other, operator.mul)
+
+    def __rmul__(self, other):
+        """Pointwise multiplication or scaling of fields.
+
+        self.__rmul__(other) <==> other * self."""
+        return self._op(other, operator.mul)
+
+    def __truediv__(self, other):
+        """Pointwise division or scaling of fields."""
+        return self._op(other, operator.truediv)
+
+    def __iadd__(self, other):
+        """Pointwise addition of fields."""
+        return self._iop(other, operator.iadd)
+
+    def __isub__(self, other):
+        """Pointwise subtraction of fields."""
+        return self._iop(other, operator.isub)
+
+    def __imul__(self, other):
+        """Pointwise multiplication or scaling of fields."""
+        return self._iop(other, operator.imul)
+
+    def __itruediv__(self, other):
+        """Pointwise division or scaling of fields."""
+        return self._iop(other, operator.itruediv)
+
+    @utils.cached_property
+    def _vec(self):
+        assert self.dtype == PETSc.ScalarType, \
+            "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType)
+        # In this case we can just duplicate the layout vec
+        # because we're not placing an array.
+        return self.dataset.layout_vec.duplicate()
+
+    @contextlib.contextmanager
+    def vec_context(self, access):
+        r"""A context manager scattering the arrays of all components of this
+        :class:`MixedDat` into a contiguous :class:`PETSc.Vec` and reverse
+        scattering to the original arrays when exiting the context.
+
+        :param access: Access descriptor: READ, WRITE, or RW.
+
+        .. note::
+
+           The :class:`~PETSc.Vec` obtained from this context is in
+           the correct order to be left multiplied by a compatible
+           :class:`MixedMat`. In parallel it is *not* just a
+           concatenation of the underlying :class:`Dat`\s."""
+        # Do the actual forward scatter to fill the full vector with
+        # values
+        if access is not Access.WRITE:
+            offset = 0
+            array = self._vec.array
+            for d in self:
+                with d.vec_ro as v:
+                    size = v.local_size
+                    array[offset:offset+size] = v.array_r[:]
+                    offset += size
+        self._vec.stateIncrease()
+        yield self._vec
+        if access is not Access.READ:
+            # Reverse scatter to get the values back to their original locations
+            offset = 0
+            array = self._vec.array_r
+            for d in self:
+                with d.vec_wo as v:
+                    size = v.local_size
+                    v.array[:] = array[offset:offset+size]
+                    offset += size
+            self.halo_valid = False
diff --git a/pyop2/types/data_carrier.py b/pyop2/types/data_carrier.py
new file mode 100644
index 000000000..78a268a84
--- /dev/null
+++ b/pyop2/types/data_carrier.py
@@ -0,0 +1,109 @@
+import abc
+
+import numpy as np
+
+from pyop2 import (
+    datatypes as dtypes,
+    mpi,
+    utils
+)
+from pyop2.types.access import Access
+
+
+class DataCarrier(abc.ABC):
+
+    """Abstract base class for OP2 data.
+
+    Actual objects will be :class:`DataCarrier` objects of rank 0
+    (:class:`Global`), rank 1 (:class:`Dat`), or rank 2
+    (:class:`Mat`)"""
+
+    @utils.cached_property
+    def dtype(self):
+        """The NumPy dtype of the data."""
+        return self._data.dtype
+
+    @utils.cached_property
+    def ctype(self):
+        """The c type of the data."""
+        return dtypes.as_cstr(self.dtype)
+
+    @utils.cached_property
+    def name(self):
+        """User-defined label."""
+        return self._name
+
+    @utils.cached_property
+    def dim(self):
+        """The shape tuple of the values for each element of the object."""
+        return self._dim
+
+    @utils.cached_property
+    def cdim(self):
+        """The scalar number of values for each member of the object. This is
+        the product of the dim tuple."""
+        return self._cdim
+
+
+class EmptyDataMixin(abc.ABC):
+    """A mixin for :class:`Dat` and :class:`Global` objects that takes
+    care of allocating data on demand if the user has passed nothing
+    in.
+ + Accessing the :attr:`_data` property allocates a zeroed data array + if it does not already exist. + """ + def __init__(self, data, dtype, shape): + if data is None: + self._dtype = np.dtype(dtype if dtype is not None else dtypes.ScalarType) + else: + self._numpy_data = utils.verify_reshape(data, dtype, shape, allow_none=True) + self._dtype = self._data.dtype + + @utils.cached_property + def _data(self): + """Return the user-provided data buffer, or a zeroed buffer of + the correct size if none was provided.""" + if not self._is_allocated: + self._numpy_data = np.zeros(self.shape, dtype=self._dtype) + return self._numpy_data + + @property + def _is_allocated(self): + """Return True if the data buffer has been allocated.""" + return hasattr(self, '_numpy_data') + + +class VecAccessMixin(abc.ABC): + @abc.abstractmethod + def vec_context(self, access): + pass + + @abc.abstractproperty + def _vec(self): + pass + + @property + @mpi.collective + def vec(self): + """Context manager for a PETSc Vec appropriate for this Dat. + + You're allowed to modify the data you get back from this view.""" + return self.vec_context(access=Access.RW) + + @property + @mpi.collective + def vec_wo(self): + """Context manager for a PETSc Vec appropriate for this Dat. + + You're allowed to modify the data you get back from this view, + but you cannot read from it.""" + return self.vec_context(access=Access.WRITE) + + @property + @mpi.collective + def vec_ro(self): + """Context manager for a PETSc Vec appropriate for this Dat. + + You're not allowed to modify the data you get back from this view.""" + return self.vec_context(access=Access.READ) diff --git a/pyop2/types/dataset.py b/pyop2/types/dataset.py new file mode 100644 index 000000000..635b130e3 --- /dev/null +++ b/pyop2/types/dataset.py @@ -0,0 +1,531 @@ +import numbers + +import numpy as np +from petsc4py import PETSc + +from pyop2 import ( + caching, + datatypes as dtypes, + exceptions as ex, + mpi, + utils +) +from pyop2.types.set import ExtrudedSet, GlobalSet, MixedSet, Set, Subset + + +class DataSet(caching.ObjectCached): + """PyOP2 Data Set + + Set used in the op2.Dat structures to specify the dimension of the data. + """ + + @utils.validate_type(('iter_set', Set, ex.SetTypeError), + ('dim', (numbers.Integral, tuple, list), ex.DimTypeError), + ('name', str, ex.NameTypeError)) + def __init__(self, iter_set, dim=1, name=None): + if isinstance(iter_set, ExtrudedSet): + raise NotImplementedError("Not allowed!") + if self._initialized: + return + if isinstance(iter_set, Subset): + raise NotImplementedError("Deriving a DataSet from a Subset is unsupported") + self._set = iter_set + self._dim = utils.as_tuple(dim, numbers.Integral) + self._cdim = np.prod(self._dim).item() + self._name = name or "dset_#x%x" % id(self) + self._initialized = True + + @classmethod + def _process_args(cls, *args, **kwargs): + return (args[0], ) + args, kwargs + + @classmethod + def _cache_key(cls, iter_set, dim=1, name=None): + return (iter_set, utils.as_tuple(dim, numbers.Integral)) + + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self), self.dim, self._set._wrapper_cache_key_) + + def __getstate__(self): + """Extract state to pickle.""" + return self.__dict__ + + def __setstate__(self, d): + """Restore from pickled state.""" + self.__dict__.update(d) + + # Look up any unspecified attributes on the _set. 
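+    # This makes a DataSet transparently proxy its underlying Set:
+    # attributes such as size, comm or halo resolve against self.set,
+    # and the result is memoised on the instance so __getattr__ only
+    # fires once per attribute.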
+ def __getattr__(self, name): + """Returns a Set specific attribute.""" + value = getattr(self.set, name) + setattr(self, name, value) + return value + + def __getitem__(self, idx): + """Allow index to return self""" + assert idx == 0 + return self + + @utils.cached_property + def dim(self): + """The shape tuple of the values for each element of the set.""" + return self._dim + + @utils.cached_property + def cdim(self): + """The scalar number of values for each member of the set. This is + the product of the dim tuple.""" + return self._cdim + + @utils.cached_property + def name(self): + """Returns the name of the data set.""" + return self._name + + @utils.cached_property + def set(self): + """Returns the parent set of the data set.""" + return self._set + + def __iter__(self): + """Yield self when iterated over.""" + yield self + + def __len__(self): + """This is not a mixed type and therefore of length 1.""" + return 1 + + def __str__(self): + return "OP2 DataSet: %s on set %s, with dim %s" % \ + (self._name, self._set, self._dim) + + def __repr__(self): + return "DataSet(%r, %r, %r)" % (self._set, self._dim, self._name) + + def __contains__(self, dat): + """Indicate whether a given Dat is compatible with this DataSet.""" + return dat.dataset == self + + @utils.cached_property + def lgmap(self): + """A PETSc LGMap mapping process-local indices to global + indices for this :class:`DataSet`. + """ + lgmap = PETSc.LGMap() + if self.comm.size == 1: + lgmap.create(indices=np.arange(self.size, dtype=dtypes.IntType), + bsize=self.cdim, comm=self.comm) + else: + lgmap.create(indices=self.halo.local_to_global_numbering, + bsize=self.cdim, comm=self.comm) + return lgmap + + @utils.cached_property + def scalar_lgmap(self): + if self.cdim == 1: + return self.lgmap + indices = self.lgmap.block_indices + return PETSc.LGMap().create(indices=indices, bsize=1, comm=self.comm) + + @utils.cached_property + def unblocked_lgmap(self): + """A PETSc LGMap mapping process-local indices to global + indices for this :class:`DataSet` with a block size of 1. + """ + if self.cdim == 1: + return self.lgmap + else: + indices = self.lgmap.indices + lgmap = PETSc.LGMap().create(indices=indices, + bsize=1, comm=self.lgmap.comm) + return lgmap + + @utils.cached_property + def field_ises(self): + """A list of PETSc ISes defining the global indices for each set in + the DataSet. + + Used when extracting blocks from matrices for solvers.""" + ises = [] + nlocal_rows = 0 + for dset in self: + nlocal_rows += dset.size * dset.cdim + offset = self.comm.scan(nlocal_rows) + offset -= nlocal_rows + for dset in self: + nrows = dset.size * dset.cdim + iset = PETSc.IS().createStride(nrows, first=offset, step=1, + comm=self.comm) + iset.setBlockSize(dset.cdim) + ises.append(iset) + offset += nrows + return tuple(ises) + + @utils.cached_property + def local_ises(self): + """A list of PETSc ISes defining the local indices for each set in the DataSet. 
+
+        Used when extracting blocks from matrices for assembly."""
+        ises = []
+        start = 0
+        for dset in self:
+            bs = dset.cdim
+            n = dset.total_size*bs
+            iset = PETSc.IS().createStride(n, first=start, step=1,
+                                           comm=mpi.COMM_SELF)
+            iset.setBlockSize(bs)
+            start += n
+            ises.append(iset)
+        return tuple(ises)
+
+    @utils.cached_property
+    def layout_vec(self):
+        """A PETSc Vec compatible with the dof layout of this DataSet."""
+        vec = PETSc.Vec().create(comm=self.comm)
+        size = (self.size * self.cdim, None)
+        vec.setSizes(size, bsize=self.cdim)
+        vec.setUp()
+        return vec
+
+    @utils.cached_property
+    def dm(self):
+        dm = PETSc.DMShell().create(comm=self.comm)
+        dm.setGlobalVector(self.layout_vec)
+        return dm
+
+
+class GlobalDataSet(DataSet):
+    """A proxy :class:`DataSet` for use in a :class:`Sparsity` where the
+    matrix has :class:`Global` rows or columns."""
+
+    def __init__(self, global_):
+        """
+        :param global_: The :class:`Global` on which this object is based."""
+
+        self._global = global_
+        self._globalset = GlobalSet(comm=self.comm)
+        self._name = "gdset_#x%x" % id(self)
+
+    @classmethod
+    def _cache_key(cls, *args):
+        return None
+
+    @utils.cached_property
+    def dim(self):
+        """The shape tuple of the values for each element of the set."""
+        return self._global._dim
+
+    @utils.cached_property
+    def cdim(self):
+        """The scalar number of values for each member of the set. This is
+        the product of the dim tuple."""
+        return self._global._cdim
+
+    @utils.cached_property
+    def name(self):
+        """Returns the name of the data set."""
+        return self._global._name
+
+    @utils.cached_property
+    def comm(self):
+        """Return the communicator on which the set is defined."""
+        return self._global.comm
+
+    @utils.cached_property
+    def set(self):
+        """Returns the parent set of the data set."""
+        return self._globalset
+
+    @utils.cached_property
+    def size(self):
+        """The number of local entries in the Dataset (1 on rank 0)"""
+        return 1 if self.comm.rank == 0 else 0
+
+    def __iter__(self):
+        """Yield self when iterated over."""
+        yield self
+
+    def __len__(self):
+        """This is not a mixed type and therefore of length 1."""
+        return 1
+
+    def __str__(self):
+        return "OP2 GlobalDataSet: %s on Global %s" % \
+            (self._name, self._global)
+
+    def __repr__(self):
+        return "GlobalDataSet(%r)" % (self._global)
+
+    @utils.cached_property
+    def lgmap(self):
+        """A PETSc LGMap mapping process-local indices to global
+        indices for this :class:`DataSet`.
+        """
+        lgmap = PETSc.LGMap()
+        lgmap.create(indices=np.arange(1, dtype=dtypes.IntType),
+                     bsize=self.cdim, comm=self.comm)
+        return lgmap
+
+    @utils.cached_property
+    def unblocked_lgmap(self):
+        """A PETSc LGMap mapping process-local indices to global
+        indices for this :class:`DataSet` with a block size of 1.
+        """
+        if self.cdim == 1:
+            return self.lgmap
+        else:
+            indices = self.lgmap.indices
+            lgmap = PETSc.LGMap().create(indices=indices,
+                                         bsize=1, comm=self.lgmap.comm)
+            return lgmap
+
+    @utils.cached_property
+    def field_ises(self):
+        """A list of PETSc ISes defining the global indices for each set in
+        the DataSet.
+
+        Used when extracting blocks from matrices for solvers."""
+        ises = []
+        nlocal_rows = 0
+        for dset in self:
+            nlocal_rows += dset.size * dset.cdim
+        offset = self.comm.scan(nlocal_rows)
+        offset -= nlocal_rows
+        for dset in self:
+            nrows = dset.size * dset.cdim
+            iset = PETSc.IS().createStride(nrows, first=offset, step=1,
+                                           comm=self.comm)
+            iset.setBlockSize(dset.cdim)
+            ises.append(iset)
+            offset += nrows
+        return tuple(ises)
+
+    @utils.cached_property
+    def local_ises(self):
+        """A list of PETSc ISes defining the local indices for each set in the DataSet.
+
+        Used when extracting blocks from matrices for assembly."""
+        raise NotImplementedError
+
+    @utils.cached_property
+    def layout_vec(self):
+        """A PETSc Vec compatible with the dof layout of this DataSet."""
+        vec = PETSc.Vec().create(comm=self.comm)
+        size = (self.size * self.cdim, None)
+        vec.setSizes(size, bsize=self.cdim)
+        vec.setUp()
+        return vec
+
+    @utils.cached_property
+    def dm(self):
+        dm = PETSc.DMShell().create(comm=self.comm)
+        dm.setGlobalVector(self.layout_vec)
+        return dm
+
+
+class MixedDataSet(DataSet):
+    r"""A container for a bag of :class:`DataSet`\s.
+
+    Initialized either from a :class:`MixedSet` and an iterable or iterator of
+    ``dims`` of corresponding length ::
+
+        mdset = op2.MixedDataSet(mset, [dim1, ..., dimN])
+
+    or from a tuple of :class:`Set`\s and an iterable of ``dims`` of
+    corresponding length ::
+
+        mdset = op2.MixedDataSet([set1, ..., setN], [dim1, ..., dimN])
+
+    If all ``dims`` are to be the same, they can also be given as an
+    :class:`int` for either of above invocations ::
+
+        mdset = op2.MixedDataSet(mset, dim)
+        mdset = op2.MixedDataSet([set1, ..., setN], dim)
+
+    Initialized from a :class:`MixedSet` without explicitly specifying ``dims``
+    they default to 1 ::
+
+        mdset = op2.MixedDataSet(mset)
+
+    Initialized from an iterable or iterator of :class:`DataSet`\s and/or
+    :class:`Set`\s, where :class:`Set`\s are implicitly upcast to
+    :class:`DataSet`\s of dim 1 ::
+
+        mdset = op2.MixedDataSet([dset1, ..., dsetN])
+    """
+
+    def __init__(self, arg, dims=None):
+        r"""
+        :param arg: a :class:`MixedSet` or an iterable or a generator
+            expression of :class:`Set`\s or :class:`DataSet`\s or a
+            mixture of both
+        :param dims: `None` (the default) or an :class:`int` or an iterable or
+            generator expression of :class:`int`\s, which **must** be
+            of same length as `arg`
+
+        .. Warning ::
+            When using generator expressions for ``arg`` or ``dims``, these
+            **must** terminate or they will cause an infinite loop.
+ """ + if self._initialized: + return + self._dsets = arg + self._initialized = True + + @classmethod + def _process_args(cls, arg, dims=None): + # If the second argument is not None it is expect to be a scalar dim + # or an iterable of dims and the first is expected to be a MixedSet or + # an iterable of Sets + if dims is not None: + # If arg is a MixedSet, get its Sets tuple + sets = arg.split if isinstance(arg, MixedSet) else tuple(arg) + # If dims is a scalar, turn it into a tuple of right length + dims = (dims,) * len(sets) if isinstance(dims, int) else tuple(dims) + if len(sets) != len(dims): + raise ValueError("Got MixedSet of %d Sets but %s dims" % + (len(sets), len(dims))) + dsets = tuple(s ** d for s, d in zip(sets, dims)) + # Otherwise expect the first argument to be an iterable of Sets and/or + # DataSets and upcast Sets to DataSets as necessary + else: + arg = [s if isinstance(s, DataSet) else s ** 1 for s in arg] + dsets = utils.as_tuple(arg, type=DataSet) + + return (dsets[0].set, ) + (dsets, ), {} + + @classmethod + def _cache_key(cls, arg, dims=None): + return arg + + @utils.cached_property + def _wrapper_cache_key_(self): + raise NotImplementedError + + def __getitem__(self, idx): + """Return :class:`DataSet` with index ``idx`` or a given slice of datasets.""" + return self._dsets[idx] + + @utils.cached_property + def split(self): + r"""The underlying tuple of :class:`DataSet`\s.""" + return self._dsets + + @utils.cached_property + def dim(self): + """The shape tuple of the values for each element of the sets.""" + return tuple(s.dim for s in self._dsets) + + @utils.cached_property + def cdim(self): + """The sum of the scalar number of values for each member of the sets. + This is the sum of products of the dim tuples.""" + return sum(s.cdim for s in self._dsets) + + @utils.cached_property + def name(self): + """Returns the name of the data sets.""" + return tuple(s.name for s in self._dsets) + + @utils.cached_property + def set(self): + """Returns the :class:`MixedSet` this :class:`MixedDataSet` is + defined on.""" + return MixedSet(s.set for s in self._dsets) + + def __iter__(self): + r"""Yield all :class:`DataSet`\s when iterated over.""" + for ds in self._dsets: + yield ds + + def __len__(self): + """Return number of contained :class:`DataSet`s.""" + return len(self._dsets) + + def __str__(self): + return "OP2 MixedDataSet composed of DataSets: %s" % (self._dsets,) + + def __repr__(self): + return "MixedDataSet(%r)" % (self._dsets,) + + @utils.cached_property + def layout_vec(self): + """A PETSc Vec compatible with the dof layout of this MixedDataSet.""" + vec = PETSc.Vec().create(comm=self.comm) + # Compute local and global size from sizes of layout vecs + lsize, gsize = map(sum, zip(*(d.layout_vec.sizes for d in self))) + vec.setSizes((lsize, gsize), bsize=1) + vec.setUp() + return vec + + @utils.cached_property + def lgmap(self): + """A PETSc LGMap mapping process-local indices to global + indices for this :class:`MixedDataSet`. + """ + lgmap = PETSc.LGMap() + if self.comm.size == 1: + size = sum(s.size * s.cdim for s in self) + lgmap.create(indices=np.arange(size, dtype=dtypes.IntType), + bsize=1, comm=self.comm) + return lgmap + # Compute local to global maps for a monolithic mixed system + # from the individual local to global maps for each field. + # Exposition: + # + # We have N fields and P processes. The global row + # ordering is: + # + # f_0_p_0, f_1_p_0, ..., f_N_p_0; f_0_p_1, ..., ; f_0_p_P, + # ..., f_N_p_P. 
+ # + # We have per-field local to global numberings, to convert + # these into multi-field local to global numberings, we note + # the following: + # + # For each entry in the per-field l2g map, we first determine + # the rank that entry belongs to, call this r. + # + # We know that this must be offset by: + # 1. The sum of all field lengths with rank < r + # 2. The sum of all lower-numbered field lengths on rank r. + # + # Finally, we need to shift the field-local entry by the + # current field offset. + idx_size = sum(s.total_size*s.cdim for s in self) + indices = np.full(idx_size, -1, dtype=dtypes.IntType) + owned_sz = np.array([sum(s.size * s.cdim for s in self)], + dtype=dtypes.IntType) + field_offset = np.empty_like(owned_sz) + self.comm.Scan(owned_sz, field_offset) + field_offset -= owned_sz + + all_field_offsets = np.empty(self.comm.size, dtype=dtypes.IntType) + self.comm.Allgather(field_offset, all_field_offsets) + + start = 0 + all_local_offsets = np.zeros(self.comm.size, dtype=dtypes.IntType) + current_offsets = np.zeros(self.comm.size + 1, dtype=dtypes.IntType) + for s in self: + idx = indices[start:start + s.total_size * s.cdim] + owned_sz[0] = s.size * s.cdim + self.comm.Scan(owned_sz, field_offset) + self.comm.Allgather(field_offset, current_offsets[1:]) + # Find the ranks each entry in the l2g belongs to + l2g = s.unblocked_lgmap.indices + tmp_indices = np.searchsorted(current_offsets, l2g, side="right") - 1 + idx[:] = l2g[:] - current_offsets[tmp_indices] + \ + all_field_offsets[tmp_indices] + all_local_offsets[tmp_indices] + self.comm.Allgather(owned_sz, current_offsets[1:]) + all_local_offsets += current_offsets[1:] + start += s.total_size * s.cdim + lgmap.create(indices=indices, bsize=1, comm=self.comm) + return lgmap + + @utils.cached_property + def unblocked_lgmap(self): + """A PETSc LGMap mapping process-local indices to global + indices for this :class:`DataSet` with a block size of 1. + """ + return self.lgmap diff --git a/pyop2/types/glob.py b/pyop2/types/glob.py new file mode 100644 index 000000000..5651db693 --- /dev/null +++ b/pyop2/types/glob.py @@ -0,0 +1,290 @@ +from contextlib import contextmanager +import ctypes +import operator + +import numpy as np +from petsc4py import PETSc + +from pyop2 import ( + exceptions as ex, + mpi, + utils +) +from pyop2.types.access import Access +from pyop2.types.dataset import GlobalDataSet +from pyop2.types.data_carrier import DataCarrier, EmptyDataMixin, VecAccessMixin + + +class Global(DataCarrier, EmptyDataMixin, VecAccessMixin): + + """OP2 global value. + + When a ``Global`` is passed to a :func:`pyop2.op2.par_loop`, the access + descriptor is passed by `calling` the ``Global``. For example, if + a ``Global`` named ``G`` is to be accessed for reading, this is + accomplished by:: + + G(pyop2.READ) + + It is permissible to pass `None` as the `data` argument. In this + case, allocation of the data buffer is postponed until it is + accessed. + + .. note:: + If the data buffer is not passed in, it is implicitly + initialised to be zero. + """ + + _modes = [Access.READ, Access.INC, Access.MIN, Access.MAX] + + @utils.validate_type(('name', str, ex.NameTypeError)) + def __init__(self, dim, data=None, dtype=None, name=None, comm=None): + if isinstance(dim, Global): + # If g is a Global, Global(g) performs a deep copy. This is for compatibility with Dat. 
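+            # (Re-run __init__ with matching dim/dtype and then copy the
+            # values across.)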
+            self.__init__(dim._dim, None, dtype=dim.dtype,
+                          name="copy_of_%s" % dim.name, comm=dim.comm)
+            dim.copy(self)
+            return
+        self._dim = utils.as_tuple(dim, int)
+        self._cdim = np.prod(self._dim).item()
+        EmptyDataMixin.__init__(self, data, dtype, self._dim)
+        self._buf = np.empty(self.shape, dtype=self.dtype)
+        self._name = name or "global_#x%x" % id(self)
+        self.comm = comm
+
+    @utils.cached_property
+    def _kernel_args_(self):
+        return (self._data.ctypes.data, )
+
+    @utils.cached_property
+    def _argtypes_(self):
+        return (ctypes.c_voidp, )
+
+    @utils.cached_property
+    def _wrapper_cache_key_(self):
+        return (type(self), self.dtype, self.shape)
+
+    @utils.validate_in(('access', _modes, ex.ModeValueError))
+    def __call__(self, access, path=None):
+        from pyop2.parloop import Arg
+        return Arg(data=self, access=access)
+
+    def __iter__(self):
+        """Yield self when iterated over."""
+        yield self
+
+    def __len__(self):
+        """This is not a mixed type and therefore of length 1."""
+        return 1
+
+    def __getitem__(self, idx):
+        """Return self if ``idx`` is 0, raise an error otherwise."""
+        if idx != 0:
+            raise ex.IndexValueError("Can only extract component 0 from %r" % self)
+        return self
+
+    def __str__(self):
+        return "OP2 Global Argument: %s with dim %s and value %s" \
+            % (self._name, self._dim, self._data)
+
+    def __repr__(self):
+        return "Global(%r, %r, %r, %r)" % (self._dim, self._data,
+                                           self._data.dtype, self._name)
+
+    @utils.cached_property
+    def dataset(self):
+        return GlobalDataSet(self)
+
+    @property
+    def shape(self):
+        return self._dim
+
+    @property
+    def data(self):
+        """Data array."""
+        if len(self._data) == 0:
+            raise RuntimeError("Illegal access: No data associated with this Global!")
+        return self._data
+
+    @property
+    def dtype(self):
+        return self._dtype
+
+    @property
+    def data_ro(self):
+        """Data array."""
+        view = self.data.view()
+        view.setflags(write=False)
+        return view
+
+    @data.setter
+    def data(self, value):
+        self._data[:] = utils.verify_reshape(value, self.dtype, self.dim)
+
+    @property
+    def nbytes(self):
+        """Return an estimate of the size of the data associated with this
+        :class:`Global` in bytes. This will be the correct size of the
+        data payload, but does not take into account the overhead of
+        the object and its metadata; it is included mainly to keep the
+        interface consistent with :class:`Dat`.
+        """
+
+        return self.dtype.itemsize * self._cdim
+
+    @mpi.collective
+    def duplicate(self):
+        """Return a deep copy of self."""
+        return type(self)(self.dim, data=np.copy(self.data_ro),
+                          dtype=self.dtype, name=self.name)
+
+    @mpi.collective
+    def copy(self, other, subset=None):
+        """Copy the data in this :class:`Global` into another.
+
+        :arg other: The destination :class:`Global`
+        :arg subset: A :class:`Subset` of elements to copy (optional)"""
+
+        other.data = np.copy(self.data_ro)
+
+    @mpi.collective
+    def zero(self):
+        self._data[...]
= 0 + + @mpi.collective + def global_to_local_begin(self, access_mode): + """Dummy halo operation for the case in which a :class:`Global` forms + part of a :class:`MixedDat`.""" + pass + + @mpi.collective + def global_to_local_end(self, access_mode): + """Dummy halo operation for the case in which a :class:`Global` forms + part of a :class:`MixedDat`.""" + pass + + @mpi.collective + def local_to_global_begin(self, insert_mode): + """Dummy halo operation for the case in which a :class:`Global` forms + part of a :class:`MixedDat`.""" + pass + + @mpi.collective + def local_to_global_end(self, insert_mode): + """Dummy halo operation for the case in which a :class:`Global` forms + part of a :class:`MixedDat`.""" + pass + + def _op(self, other, op): + ret = type(self)(self.dim, dtype=self.dtype, name=self.name, comm=self.comm) + if isinstance(other, Global): + ret.data[:] = op(self.data_ro, other.data_ro) + else: + ret.data[:] = op(self.data_ro, other) + return ret + + def _iop(self, other, op): + if isinstance(other, Global): + op(self.data[:], other.data_ro) + else: + op(self.data[:], other) + return self + + def __pos__(self): + return self.duplicate() + + def __add__(self, other): + """Pointwise addition of fields.""" + return self._op(other, operator.add) + + def __radd__(self, other): + """Pointwise addition of fields. + + self.__radd__(other) <==> other + self.""" + return self + other + + def __neg__(self): + return type(self)(self.dim, data=-np.copy(self.data_ro), + dtype=self.dtype, name=self.name) + + def __sub__(self, other): + """Pointwise subtraction of fields.""" + return self._op(other, operator.sub) + + def __rsub__(self, other): + """Pointwise subtraction of fields. + + self.__rsub__(other) <==> other - self.""" + ret = -self + ret += other + return ret + + def __mul__(self, other): + """Pointwise multiplication or scaling of fields.""" + return self._op(other, operator.mul) + + def __rmul__(self, other): + """Pointwise multiplication or scaling of fields. + + self.__rmul__(other) <==> other * self.""" + return self.__mul__(other) + + def __truediv__(self, other): + """Pointwise division or scaling of fields.""" + return self._op(other, operator.truediv) + + def __iadd__(self, other): + """Pointwise addition of fields.""" + return self._iop(other, operator.iadd) + + def __isub__(self, other): + """Pointwise subtraction of fields.""" + return self._iop(other, operator.isub) + + def __imul__(self, other): + """Pointwise multiplication or scaling of fields.""" + return self._iop(other, operator.imul) + + def __itruediv__(self, other): + """Pointwise division or scaling of fields.""" + return self._iop(other, operator.itruediv) + + def inner(self, other): + assert isinstance(other, Global) + return np.dot(self.data_ro, np.conj(other.data_ro)) + + @utils.cached_property + def _vec(self): + assert self.dtype == PETSc.ScalarType, \ + "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) + # Can't duplicate layout_vec of dataset, because we then + # carry around extra unnecessary data. + # But use getSizes to save an Allreduce in computing the + # global size. 
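+        # Only rank 0 contributes entries: every rank holds the same
+        # values redundantly, so the Vec's parallel layout is cdim
+        # entries on rank 0 and none elsewhere.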
+        data = self._data
+        size = self.dataset.layout_vec.getSizes()
+        if self.comm.rank == 0:
+            return PETSc.Vec().createWithArray(data, size=size,
+                                               bsize=self.cdim,
+                                               comm=self.comm)
+        else:
+            return PETSc.Vec().createWithArray(np.empty(0, dtype=self.dtype),
+                                               size=size,
+                                               bsize=self.cdim,
+                                               comm=self.comm)
+
+    @contextmanager
+    def vec_context(self, access):
+        """A context manager for a :class:`PETSc.Vec` from a :class:`Global`.
+
+        :param access: Access descriptor: READ, WRITE, or RW."""
+        # PETSc Vecs have a state counter and cache norm computations
+        # to return immediately if the state counter is unchanged.
+        # Since we've updated the data behind their back, we need to
+        # change that state counter.
+        self._vec.stateIncrease()
+        yield self._vec
+        if access is not Access.READ:
+            data = self._data
+            self.comm.Bcast(data, 0)
diff --git a/pyop2/types/halo.py b/pyop2/types/halo.py
new file mode 100644
index 000000000..6b69e686f
--- /dev/null
+++ b/pyop2/types/halo.py
@@ -0,0 +1,56 @@
+import abc
+
+
+class Halo(abc.ABC):
+
+    """A description of a halo associated with a :class:`Set`.
+
+    The halo object describes which :class:`Set` elements are sent
+    where, and which :class:`Set` elements are received from where.
+    """
+
+    @abc.abstractproperty
+    def comm(self):
+        """The MPI communicator for this halo."""
+        pass
+
+    @abc.abstractproperty
+    def local_to_global_numbering(self):
+        """The mapping from process-local to process-global numbers for this halo."""
+        pass
+
+    @abc.abstractmethod
+    def global_to_local_begin(self, dat, insert_mode):
+        """Begin an exchange from global (assembled) to local (ghosted) representation.
+
+        :arg dat: The :class:`Dat` to exchange.
+        :arg insert_mode: The insertion mode.
+        """
+        pass
+
+    @abc.abstractmethod
+    def global_to_local_end(self, dat, insert_mode):
+        """Finish an exchange from global (assembled) to local (ghosted) representation.
+
+        :arg dat: The :class:`Dat` to exchange.
+        :arg insert_mode: The insertion mode.
+        """
+        pass
+
+    @abc.abstractmethod
+    def local_to_global_begin(self, dat, insert_mode):
+        """Begin an exchange from local (ghosted) to global (assembled) representation.
+
+        :arg dat: The :class:`Dat` to exchange.
+        :arg insert_mode: The insertion mode.
+        """
+        pass
+
+    @abc.abstractmethod
+    def local_to_global_end(self, dat, insert_mode):
+        """Finish an exchange from local (ghosted) to global (assembled) representation.
+
+        :arg dat: The :class:`Dat` to exchange.
+        :arg insert_mode: The insertion mode.
+        """
+        pass
diff --git a/pyop2/types/map.py b/pyop2/types/map.py
new file mode 100644
index 000000000..ce4843a6c
--- /dev/null
+++ b/pyop2/types/map.py
@@ -0,0 +1,305 @@
+import ctypes
+import itertools
+import functools
+import numbers
+
+import numpy as np
+
+from pyop2 import (
+    caching,
+    datatypes as dtypes,
+    exceptions as ex,
+    utils
+)
+from pyop2.types.set import GlobalSet, MixedSet, Set
+
+
+class Map:
+
+    """OP2 map, a relation between two :class:`Set` objects.
+
+    Each entry in the ``iterset`` maps to ``arity`` entries in the
+    ``toset``. When a map is used in a :func:`pyop2.op2.par_loop`, it is
+    possible to use Python index notation to select an individual entry on the
+    right hand side of this map. There are two possibilities:
+
+    * No index. All ``arity`` :class:`Dat` entries will be passed to the
+      kernel.
+    * An integer: ``some_map[n]``. The ``n`` th entry of the
+      map result will be passed to the kernel.
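+
+    For example (assuming a map ``elem_node`` from elements to their
+    vertices and a :class:`Dat` ``coords`` defined on the vertices),
+    passing ``coords(op2.READ, elem_node)`` to a
+    :func:`pyop2.op2.par_loop` gathers all ``arity`` entries of
+    ``coords`` for each element.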
+ """ + + dtype = dtypes.IntType + + @utils.validate_type(('iterset', Set, ex.SetTypeError), ('toset', Set, ex.SetTypeError), + ('arity', numbers.Integral, ex.ArityTypeError), ('name', str, ex.NameTypeError)) + def __init__(self, iterset, toset, arity, values=None, name=None, offset=None): + self._iterset = iterset + self._toset = toset + self.comm = toset.comm + self._arity = arity + self._values = utils.verify_reshape(values, dtypes.IntType, + (iterset.total_size, arity), allow_none=True) + self.shape = (iterset.total_size, arity) + self._name = name or "map_#x%x" % id(self) + if offset is None or len(offset) == 0: + self._offset = None + else: + self._offset = utils.verify_reshape(offset, dtypes.IntType, (arity, )) + # A cache for objects built on top of this map + self._cache = {} + + @utils.cached_property + def _kernel_args_(self): + return (self._values.ctypes.data, ) + + @utils.cached_property + def _argtypes_(self): + return (ctypes.c_voidp, ) + + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self), self.arity, utils.tuplify(self.offset)) + + # This is necessary so that we can convert a Map to a tuple + # (needed in as_tuple). Because, __getitem__ no longer returns a + # Map we have to explicitly provide an iterable interface + def __iter__(self): + """Yield self when iterated over.""" + yield self + + def __len__(self): + """This is not a mixed type and therefore of length 1.""" + return 1 + + @utils.cached_property + def split(self): + return (self,) + + @utils.cached_property + def iterset(self): + """:class:`Set` mapped from.""" + return self._iterset + + @utils.cached_property + def toset(self): + """:class:`Set` mapped to.""" + return self._toset + + @utils.cached_property + def arity(self): + """Arity of the mapping: number of toset elements mapped to per + iterset element.""" + return self._arity + + @utils.cached_property + def arities(self): + """Arity of the mapping: number of toset elements mapped to per + iterset element. + + :rtype: tuple""" + return (self._arity,) + + @utils.cached_property + def arange(self): + """Tuple of arity offsets for each constituent :class:`Map`.""" + return (0, self._arity) + + @utils.cached_property + def values(self): + """Mapping array. + + This only returns the map values for local points, to see the + halo points too, use :meth:`values_with_halo`.""" + return self._values[:self.iterset.size] + + @utils.cached_property + def values_with_halo(self): + """Mapping array. + + This returns all map values (including halo points), see + :meth:`values` if you only need to look at the local + points.""" + return self._values + + @utils.cached_property + def name(self): + """User-defined label""" + return self._name + + @utils.cached_property + def offset(self): + """The vertical offset.""" + return self._offset + + def __str__(self): + return "OP2 Map: %s from (%s) to (%s) with arity %s" \ + % (self._name, self._iterset, self._toset, self._arity) + + def __repr__(self): + return "Map(%r, %r, %r, None, %r)" \ + % (self._iterset, self._toset, self._arity, self._name) + + def __le__(self, o): + """self<=o if o equals self or self._parent <= o.""" + return self == o + + +class PermutedMap(Map): + """Composition of a standard :class:`Map` with a constant permutation. + + :arg map_: The map to permute. + :arg permutation: The permutation of the map indices. + + Where normally staging to element data is performed as + + .. code-block:: + + local[i] = global[map[i]] + + With a :class:`PermutedMap` we instead get + + .. 
code-block::
+
+       local[i] = global[map[permutation[i]]]
+
+    This might be useful if your local kernel wants data in a
+    different order to the one that the map provides, and you don't
+    want two global-sized data structures.
+    """
+    def __init__(self, map_, permutation):
+        self.map_ = map_
+        self.permutation = np.asarray(permutation, dtype=Map.dtype)
+        assert (np.unique(permutation) == np.arange(map_.arity, dtype=Map.dtype)).all()
+
+    @utils.cached_property
+    def _wrapper_cache_key_(self):
+        return super()._wrapper_cache_key_ + (tuple(self.permutation),)
+
+    def __getattr__(self, name):
+        return getattr(self.map_, name)
+
+
+class MixedMap(Map, caching.ObjectCached):
+    r"""A container for a bag of :class:`Map`\s."""
+
+    def __init__(self, maps):
+        r""":param iterable maps: Iterable of :class:`Map`\s"""
+        if self._initialized:
+            return
+        self._maps = maps
+        if not all(m is None or m.iterset == self.iterset for m in self._maps):
+            raise ex.MapTypeError("All maps in a MixedMap need to share the same iterset")
+        # TODO: Think about different communicators on maps (c.f. MixedSet)
+        # TODO: What if all maps are None?
+        comms = tuple(m.comm for m in self._maps if m is not None)
+        if not all(c == comms[0] for c in comms):
+            raise ex.MapTypeError("All maps need to share a communicator")
+        if len(comms) == 0:
+            raise ex.MapTypeError("Don't know how to make communicator")
+        self.comm = comms[0]
+        self._initialized = True
+
+    @classmethod
+    def _process_args(cls, *args, **kwargs):
+        maps = utils.as_tuple(args[0], type=Map, allow_none=True)
+        cache = maps[0]
+        return (cache, ) + (maps, ), kwargs
+
+    @classmethod
+    def _cache_key(cls, maps):
+        return maps
+
+    @utils.cached_property
+    def _kernel_args_(self):
+        return tuple(itertools.chain(*(m._kernel_args_ for m in self if m is not None)))
+
+    @utils.cached_property
+    def _argtypes_(self):
+        return tuple(itertools.chain(*(m._argtypes_ for m in self if m is not None)))
+
+    @utils.cached_property
+    def _wrapper_cache_key_(self):
+        return tuple(m._wrapper_cache_key_ for m in self if m is not None)
+
+    @utils.cached_property
+    def split(self):
+        r"""The underlying tuple of :class:`Map`\s."""
+        return self._maps
+
+    @utils.cached_property
+    def iterset(self):
+        """:class:`MixedSet` mapped from."""
+        return functools.reduce(lambda a, b: a or b, map(lambda s: s if s is None else s.iterset, self._maps))
+
+    @utils.cached_property
+    def toset(self):
+        """:class:`MixedSet` mapped to."""
+        return MixedSet(tuple(GlobalSet(comm=self.comm) if m is None else
+                              m.toset for m in self._maps))
+
+    @utils.cached_property
+    def arity(self):
+        """Arity of the mapping: total number of toset elements mapped to per
+        iterset element."""
+        return sum(m.arity for m in self._maps)
+
+    @utils.cached_property
+    def arities(self):
+        """Arity of the mapping: number of toset elements mapped to per
+        iterset element.
+
+        :rtype: tuple"""
+        return tuple(m.arity for m in self._maps)
+
+    @utils.cached_property
+    def arange(self):
+        """Tuple of arity offsets for each constituent :class:`Map`."""
+        return (0,) + tuple(np.cumsum(self.arities))
+
+    @utils.cached_property
+    def values(self):
+        """Mapping arrays excluding data for halos.
+
+        This only returns the map values for local points, to see the
+        halo points too, use :meth:`values_with_halo`."""
+        return tuple(m.values for m in self._maps)
+
+    @utils.cached_property
+    def values_with_halo(self):
+        """Mapping arrays including data for halos.
+ + This returns all map values (including halo points), see + :meth:`values` if you only need to look at the local + points.""" + return tuple(None if m is None else + m.values_with_halo for m in self._maps) + + @utils.cached_property + def name(self): + """User-defined labels""" + return tuple(m.name for m in self._maps) + + @utils.cached_property + def offset(self): + """Vertical offsets.""" + return tuple(0 if m is None else m.offset for m in self._maps) + + def __iter__(self): + r"""Yield all :class:`Map`\s when iterated over.""" + for m in self._maps: + yield m + + def __len__(self): + r"""Number of contained :class:`Map`\s.""" + return len(self._maps) + + def __le__(self, o): + """self<=o if o equals self or its self._parent==o.""" + return self == o or all(m <= om for m, om in zip(self, o)) + + def __str__(self): + return "OP2 MixedMap composed of Maps: %s" % (self._maps,) + + def __repr__(self): + return "MixedMap(%r)" % (self._maps,) diff --git a/pyop2/petsc_base.py b/pyop2/types/mat.py similarity index 51% rename from pyop2/petsc_base.py rename to pyop2/types/mat.py index ef38b3aa3..2ffdae6ff 100644 --- a/pyop2/petsc_base.py +++ b/pyop2/types/mat.py @@ -1,443 +1,357 @@ -# This file is part of PyOP2 -# -# PyOP2 is Copyright (c) 2012, Imperial College London and -# others. Please see the AUTHORS file in the main source directory for -# a full list of copyright holders. All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions -# are met: -# -# * Redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer. -# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * The name of Imperial College London or that of other -# contributors may not be used to endorse or promote products -# derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS -# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -# OF THE POSSIBILITY OF SUCH DAMAGE. 
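For orientation before the renamed matrix module below, here is a minimal
sketch of how the map classes above fit together (the sets, values and
permutation are invented for illustration and are not part of the patch)::

    cells = Set(4)
    vertices = Set(6)
    # Arity-3 map: each row lists the three vertices of one cell.
    cell_vertex = Map(cells, vertices, 3,
                      values=[[0, 1, 2], [1, 2, 3], [2, 3, 4], [3, 4, 5]])
    # Stage the same data in reversed order without a second map.
    reversed_cell_vertex = PermutedMap(cell_vertex, [2, 1, 0])
    # A bag of maps over the same iteration set, e.g. for mixed systems.
    both = MixedMap((cell_vertex, cell_vertex))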
- -from contextlib import contextmanager -from petsc4py import PETSc +import abc +import ctypes import itertools + import numpy as np -import abc +from petsc4py import PETSc -from pyop2.datatypes import IntType, ScalarType -from pyop2 import base -from pyop2 import mpi -from pyop2 import sparsity -from pyop2 import utils -from pyop2.base import _make_object, Subset -from pyop2.mpi import collective -from pyop2.profiling import timed_region +from pyop2 import ( + caching, + configuration as conf, + datatypes as dtypes, + exceptions as ex, + mpi, + profiling, + sparsity, + utils +) +from pyop2.types.access import Access +from pyop2.types.data_carrier import DataCarrier +from pyop2.types.dataset import DataSet, GlobalDataSet, MixedDataSet +from pyop2.types.map import Map +from pyop2.types.set import MixedSet, Set, Subset -class DataSet(base.DataSet): +class Sparsity(caching.ObjectCached): - @utils.cached_property - def lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`DataSet`. - """ - lgmap = PETSc.LGMap() - if self.comm.size == 1: - lgmap.create(indices=np.arange(self.size, dtype=IntType), - bsize=self.cdim, comm=self.comm) - else: - lgmap.create(indices=self.halo.local_to_global_numbering, - bsize=self.cdim, comm=self.comm) - return lgmap + """OP2 Sparsity, the non-zero structure a matrix derived from the union of + the outer product of pairs of :class:`Map` objects. - @utils.cached_property - def scalar_lgmap(self): - if self.cdim == 1: - return self.lgmap - indices = self.lgmap.block_indices - return PETSc.LGMap().create(indices=indices, bsize=1, comm=self.comm) + Examples of constructing a Sparsity: :: - @utils.cached_property - def unblocked_lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`DataSet` with a block size of 1. + Sparsity(single_dset, single_map, 'mass') + Sparsity((row_dset, col_dset), (single_rowmap, single_colmap)) + Sparsity((row_dset, col_dset), + [(first_rowmap, first_colmap), (second_rowmap, second_colmap)]) + + .. _MatMPIAIJSetPreallocation: http://www.mcs.anl.gov/petsc/petsc-current/docs/manualpages/Mat/MatMPIAIJSetPreallocation.html + """ + + def __init__(self, dsets, maps, *, iteration_regions=None, name=None, nest=None, block_sparse=None): + r""" + :param dsets: :class:`DataSet`\s for the left and right function + spaces this :class:`Sparsity` maps between + :param maps: :class:`Map`\s to build the :class:`Sparsity` from + :type maps: a pair of :class:`Map`\s specifying a row map and a column + map, or an iterable of pairs of :class:`Map`\s specifying multiple + row and column maps - if a single :class:`Map` is passed, it is + used as both a row map and a column map + :param iteration_regions: regions that select subsets of extruded maps to iterate over. + :param string name: user-defined label (optional) + :param nest: Should the sparsity over mixed set be built as nested blocks? + :param block_sparse: Should the sparsity for datasets with + cdim > 1 be built as a block sparsity? 
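        A hedged sketch of a typical call, using placeholder names for the
        data sets and maps (they are not defined in this patch)::

            Sparsity((row_dset, col_dset), ((row_map, col_map),),
                     name="coupling", nest=False, block_sparse=True)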
""" - if self.cdim == 1: - return self.lgmap + # Protect against re-initialization when retrieved from cache + if self._initialized: + return + + self._block_sparse = block_sparse + # Split into a list of row maps and a list of column maps + maps, iteration_regions = zip(*maps) + self._rmaps, self._cmaps = zip(*maps) + self._dsets = dsets + + if isinstance(dsets[0], GlobalDataSet) or isinstance(dsets[1], GlobalDataSet): + self._dims = (((1, 1),),) + self._d_nnz = None + self._o_nnz = None + self._nrows = None if isinstance(dsets[0], GlobalDataSet) else self._rmaps[0].toset.size + self._ncols = None if isinstance(dsets[1], GlobalDataSet) else self._cmaps[0].toset.size + self.lcomm = dsets[0].comm if isinstance(dsets[0], GlobalDataSet) else self._rmaps[0].comm + self.rcomm = dsets[1].comm if isinstance(dsets[1], GlobalDataSet) else self._cmaps[0].comm else: - indices = self.lgmap.indices - lgmap = PETSc.LGMap().create(indices=indices, - bsize=1, comm=self.lgmap.comm) - return lgmap + self.lcomm = self._rmaps[0].comm + self.rcomm = self._cmaps[0].comm - @utils.cached_property - def field_ises(self): - """A list of PETSc ISes defining the global indices for each set in - the DataSet. - - Used when extracting blocks from matrices for solvers.""" - ises = [] - nlocal_rows = 0 - for dset in self: - nlocal_rows += dset.size * dset.cdim - offset = self.comm.scan(nlocal_rows) - offset -= nlocal_rows - for dset in self: - nrows = dset.size * dset.cdim - iset = PETSc.IS().createStride(nrows, first=offset, step=1, - comm=self.comm) - iset.setBlockSize(dset.cdim) - ises.append(iset) - offset += nrows - return tuple(ises) + rset, cset = self.dsets + # All rmaps and cmaps have the same data set - just use the first. + self._nrows = rset.size + self._ncols = cset.size - @utils.cached_property - def local_ises(self): - """A list of PETSc ISes defining the local indices for each set in the DataSet. - - Used when extracting blocks from matrices for assembly.""" - ises = [] - start = 0 - for dset in self: - bs = dset.cdim - n = dset.total_size*bs - iset = PETSc.IS().createStride(n, first=start, step=1, - comm=mpi.COMM_SELF) - iset.setBlockSize(bs) - start += n - ises.append(iset) - return tuple(ises) + self._has_diagonal = (rset == cset) - @utils.cached_property - def layout_vec(self): - """A PETSc Vec compatible with the dof layout of this DataSet.""" - vec = PETSc.Vec().create(comm=self.comm) - size = (self.size * self.cdim, None) - vec.setSizes(size, bsize=self.cdim) - vec.setUp() - return vec + tmp = itertools.product([x.cdim for x in self._dsets[0]], + [x.cdim for x in self._dsets[1]]) - @utils.cached_property - def dm(self): - dm = PETSc.DMShell().create(comm=self.comm) - dm.setGlobalVector(self.layout_vec) - return dm + dims = [[None for _ in range(self.shape[1])] for _ in range(self.shape[0])] + for r in range(self.shape[0]): + for c in range(self.shape[1]): + dims[r][c] = next(tmp) + self._dims = tuple(tuple(d) for d in dims) -class GlobalDataSet(base.GlobalDataSet): + if self.lcomm != self.rcomm: + raise ValueError("Haven't thought hard enough about different left and right communicators") + self.comm = self.lcomm - @utils.cached_property - def lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`DataSet`. 
- """ - lgmap = PETSc.LGMap() - lgmap.create(indices=np.arange(1, dtype=IntType), - bsize=self.cdim, comm=self.comm) - return lgmap + self._name = name or "sparsity_#x%x" % id(self) + + self.iteration_regions = iteration_regions + # If the Sparsity is defined on MixedDataSets, we need to build each + # block separately + if (isinstance(dsets[0], MixedDataSet) or isinstance(dsets[1], MixedDataSet)) \ + and nest: + self._nested = True + self._blocks = [] + for i, rds in enumerate(dsets[0]): + row = [] + for j, cds in enumerate(dsets[1]): + row.append(Sparsity((rds, cds), [(rm.split[i], cm.split[j]) for + rm, cm in maps], + iteration_regions=iteration_regions, + block_sparse=block_sparse)) + self._blocks.append(row) + self._d_nnz = tuple(s._d_nnz for s in self) + self._o_nnz = tuple(s._o_nnz for s in self) + elif isinstance(dsets[0], GlobalDataSet) or isinstance(dsets[1], GlobalDataSet): + # Where the sparsity maps either from or to a Global, we + # don't really have any sparsity structure. + self._blocks = [[self]] + self._nested = False + else: + for dset in dsets: + if isinstance(dset, MixedDataSet) and any([isinstance(d, GlobalDataSet) for d in dset]): + raise ex.SparsityFormatError("Mixed monolithic matrices with Global rows or columns are not supported.") + self._nested = False + with profiling.timed_region("CreateSparsity"): + nnz, onnz = sparsity.build_sparsity(self) + self._d_nnz = nnz + self._o_nnz = onnz + self._blocks = [[self]] + self._initialized = True + + _cache = {} - @utils.cached_property - def unblocked_lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`DataSet` with a block size of 1. - """ - if self.cdim == 1: - return self.lgmap + @classmethod + @utils.validate_type(('dsets', (Set, DataSet, tuple, list), ex.DataSetTypeError), + ('maps', (Map, tuple, list), ex.MapTypeError)) + def _process_args(cls, dsets, maps, *, iteration_regions=None, name=None, nest=None, block_sparse=None): + "Turn maps argument into a canonical tuple of pairs." + from pyop2.parloop import IterationRegion + + # A single data set becomes a pair of identical data sets + dsets = [dsets, dsets] if isinstance(dsets, (Set, DataSet)) else list(dsets) + # Upcast Sets to DataSets + dsets = [s ** 1 if isinstance(s, Set) else s for s in dsets] + + # Check data sets are valid + for dset in dsets: + if not isinstance(dset, DataSet) and dset is not None: + raise ex.DataSetTypeError("All data sets must be of type DataSet, not type %r" % type(dset)) + + # A single map becomes a pair of identical maps + maps = (maps, maps) if isinstance(maps, Map) else maps + # A single pair becomes a tuple of one pair + maps = (maps,) if isinstance(maps[0], Map) else maps + + # Check maps are sane + for pair in maps: + if pair[0] is None or pair[1] is None: + # None of this checking makes sense if one of the + # matrix operands is a Global. 
+ continue + for m in pair: + if not isinstance(m, Map): + raise ex.MapTypeError( + "All maps must be of type map, not type %r" % type(m)) + if len(m.values_with_halo) == 0 and m.iterset.total_size > 0: + raise ex.MapValueError( + "Unpopulated map values when trying to build sparsity.") + # Make sure that the "to" Set of each map in a pair is the set of + # the corresponding DataSet set + if not (pair[0].toset == dsets[0].set + and pair[1].toset == dsets[1].set): + raise RuntimeError("Map to set must be the same as corresponding DataSet set") + + # Each pair of maps must have the same from-set (iteration set) + if not pair[0].iterset == pair[1].iterset: + raise RuntimeError("Iterset of both maps in a pair must be the same") + + rmaps, cmaps = zip(*maps) + if iteration_regions is None: + iteration_regions = tuple((IterationRegion.ALL, ) for _ in maps) + else: + iteration_regions = tuple(tuple(sorted(region)) for region in iteration_regions) + if not len(rmaps) == len(cmaps): + raise RuntimeError("Must pass equal number of row and column maps") + + if rmaps[0] is not None and cmaps[0] is not None: + # Each row map must have the same to-set (data set) + if not all(m.toset == rmaps[0].toset for m in rmaps): + raise RuntimeError("To set of all row maps must be the same") + + # Each column map must have the same to-set (data set) + if not all(m.toset == cmaps[0].toset for m in cmaps): + raise RuntimeError("To set of all column maps must be the same") + + # Need to return the caching object, a tuple of the processed + # arguments and a dict of kwargs (empty in this case) + if isinstance(dsets[0], GlobalDataSet): + cache = None + elif isinstance(dsets[0].set, MixedSet): + cache = dsets[0].set[0] else: - indices = self.lgmap.indices - lgmap = PETSc.LGMap().create(indices=indices, - bsize=1, comm=self.lgmap.comm) - return lgmap + cache = dsets[0].set + if nest is None: + nest = conf.configuration["matnest"] + if block_sparse is None: + block_sparse = conf.configuration["block_sparsity"] + + maps = frozenset(zip(maps, iteration_regions)) + kwargs = {"name": name, + "nest": nest, + "block_sparse": block_sparse} + return (cache,) + (tuple(dsets), maps), kwargs + + @classmethod + def _cache_key(cls, dsets, maps, name, nest, block_sparse, *args, **kwargs): + return (dsets, maps, nest, block_sparse) + + def __getitem__(self, idx): + """Return :class:`Sparsity` block with row and column given by ``idx`` + or a given row of blocks.""" + try: + i, j = idx + return self._blocks[i][j] + except TypeError: + return self._blocks[idx] @utils.cached_property - def field_ises(self): - """A list of PETSc ISes defining the global indices for each set in - the DataSet. - - Used when extracting blocks from matrices for solvers.""" - ises = [] - nlocal_rows = 0 - for dset in self: - nlocal_rows += dset.size * dset.cdim - offset = self.comm.scan(nlocal_rows) - offset -= nlocal_rows - for dset in self: - nrows = dset.size * dset.cdim - iset = PETSc.IS().createStride(nrows, first=offset, step=1, - comm=self.comm) - iset.setBlockSize(dset.cdim) - ises.append(iset) - offset += nrows - return tuple(ises) + def dsets(self): + r"""A pair of :class:`DataSet`\s for the left and right function + spaces this :class:`Sparsity` maps between.""" + return self._dsets @utils.cached_property - def local_ises(self): - """A list of PETSc ISes defining the local indices for each set in the DataSet. + def maps(self): + """A list of pairs (rmap, cmap) where each pair of + :class:`Map` objects will later be used to assemble into this + matrix. 
The iterset of each of the maps in a pair must be the + same, while the toset of all the maps which appear first + must be common, this will form the row :class:`Set` of the + sparsity. Similarly, the toset of all the maps which appear + second must be common and will form the column :class:`Set` of + the ``Sparsity``.""" + return list(zip(self._rmaps, self._cmaps)) - Used when extracting blocks from matrices for assembly.""" - raise NotImplementedError + @utils.cached_property + def cmaps(self): + """The list of column maps this sparsity is assembled from.""" + return self._cmaps @utils.cached_property - def layout_vec(self): - """A PETSc Vec compatible with the dof layout of this DataSet.""" - vec = PETSc.Vec().create(comm=self.comm) - size = (self.size * self.cdim, None) - vec.setSizes(size, bsize=self.cdim) - vec.setUp() - return vec + def rmaps(self): + """The list of row maps this sparsity is assembled from.""" + return self._rmaps @utils.cached_property - def dm(self): - dm = PETSc.DMShell().create(comm=self.comm) - dm.setGlobalVector(self.layout_vec) - return dm + def dims(self): + """A tuple of tuples where the ``i,j``th entry + is a pair giving the number of rows per entry of the row + :class:`Set` and the number of columns per entry of the column + :class:`Set` of the ``Sparsity``. The extents of the first + two indices are given by the :attr:`shape` of the sparsity. + """ + return self._dims + @utils.cached_property + def shape(self): + """Number of block rows and columns.""" + return (len(self._dsets[0] or [1]), + len(self._dsets[1] or [1])) -class MixedDataSet(DataSet, base.MixedDataSet): + @utils.cached_property + def nrows(self): + """The number of rows in the ``Sparsity``.""" + return self._nrows @utils.cached_property - def layout_vec(self): - """A PETSc Vec compatible with the dof layout of this MixedDataSet.""" - vec = PETSc.Vec().create(comm=self.comm) - # Compute local and global size from sizes of layout vecs - lsize, gsize = map(sum, zip(*(d.layout_vec.sizes for d in self))) - vec.setSizes((lsize, gsize), bsize=1) - vec.setUp() - return vec + def ncols(self): + """The number of columns in the ``Sparsity``.""" + return self._ncols @utils.cached_property - def lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`MixedDataSet`. + def nested(self): + r"""Whether a sparsity is monolithic (even if it has a block structure). + + To elaborate, if a sparsity maps between + :class:`MixedDataSet`\s, it can either be nested, in which + case it consists of as many blocks are the product of the + length of the datasets it maps between, or monolithic. In the + latter case the sparsity is for the full map between the mixed + datasets, rather than between the blocks of the non-mixed + datasets underneath them. """ - lgmap = PETSc.LGMap() - if self.comm.size == 1: - size = sum(s.size * s.cdim for s in self) - lgmap.create(indices=np.arange(size, dtype=IntType), - bsize=1, comm=self.comm) - return lgmap - # Compute local to global maps for a monolithic mixed system - # from the individual local to global maps for each field. - # Exposition: - # - # We have N fields and P processes. The global row - # ordering is: - # - # f_0_p_0, f_1_p_0, ..., f_N_p_0; f_0_p_1, ..., ; f_0_p_P, - # ..., f_N_p_P. 
- # - # We have per-field local to global numberings, to convert - # these into multi-field local to global numberings, we note - # the following: - # - # For each entry in the per-field l2g map, we first determine - # the rank that entry belongs to, call this r. - # - # We know that this must be offset by: - # 1. The sum of all field lengths with rank < r - # 2. The sum of all lower-numbered field lengths on rank r. - # - # Finally, we need to shift the field-local entry by the - # current field offset. - idx_size = sum(s.total_size*s.cdim for s in self) - indices = np.full(idx_size, -1, dtype=IntType) - owned_sz = np.array([sum(s.size * s.cdim for s in self)], - dtype=IntType) - field_offset = np.empty_like(owned_sz) - self.comm.Scan(owned_sz, field_offset) - field_offset -= owned_sz - - all_field_offsets = np.empty(self.comm.size, dtype=IntType) - self.comm.Allgather(field_offset, all_field_offsets) - - start = 0 - all_local_offsets = np.zeros(self.comm.size, dtype=IntType) - current_offsets = np.zeros(self.comm.size + 1, dtype=IntType) - for s in self: - idx = indices[start:start + s.total_size * s.cdim] - owned_sz[0] = s.size * s.cdim - self.comm.Scan(owned_sz, field_offset) - self.comm.Allgather(field_offset, current_offsets[1:]) - # Find the ranks each entry in the l2g belongs to - l2g = s.unblocked_lgmap.indices - tmp_indices = np.searchsorted(current_offsets, l2g, side="right") - 1 - idx[:] = l2g[:] - current_offsets[tmp_indices] + \ - all_field_offsets[tmp_indices] + all_local_offsets[tmp_indices] - self.comm.Allgather(owned_sz, current_offsets[1:]) - all_local_offsets += current_offsets[1:] - start += s.total_size * s.cdim - lgmap.create(indices=indices, bsize=1, comm=self.comm) - return lgmap + return self._nested @utils.cached_property - def unblocked_lgmap(self): - """A PETSc LGMap mapping process-local indices to global - indices for this :class:`DataSet` with a block size of 1. - """ - return self.lgmap + def name(self): + """A user-defined label.""" + return self._name + def __iter__(self): + r"""Iterate over all :class:`Sparsity`\s by row and then by column.""" + for row in self._blocks: + for s in row: + yield s -class VecAccessMixin(metaclass=abc.ABCMeta): - @abc.abstractmethod - def vec_context(self, access): - pass + def __str__(self): + return "OP2 Sparsity: dsets %s, rmaps %s, cmaps %s, name %s" % \ + (self._dsets, self._rmaps, self._cmaps, self._name) - @abc.abstractproperty - def _vec(self): - pass + def __repr__(self): + return "Sparsity(%r, %r, %r)" % (self.dsets, self.maps, self.name) - @property - @collective - def vec(self): - """Context manager for a PETSc Vec appropriate for this Dat. + @utils.cached_property + def nnz(self): + """Array containing the number of non-zeroes in the various rows of the + diagonal portion of the local submatrix. - You're allowed to modify the data you get back from this view.""" - return self.vec_context(access=base.RW) + This is the same as the parameter `d_nnz` used for preallocation in + PETSc's MatMPIAIJSetPreallocation_.""" + return self._d_nnz - @property - @collective - def vec_wo(self): - """Context manager for a PETSc Vec appropriate for this Dat. + @utils.cached_property + def onnz(self): + """Array containing the number of non-zeroes in the various rows of the + off-diagonal portion of the local submatrix. 
- You're allowed to modify the data you get back from this view, - but you cannot read from it.""" - return self.vec_context(access=base.WRITE) + This is the same as the parameter `o_nnz` used for preallocation in + PETSc's MatMPIAIJSetPreallocation_.""" + return self._o_nnz - @property - @collective - def vec_ro(self): - """Context manager for a PETSc Vec appropriate for this Dat. + @utils.cached_property + def nz(self): + return self._d_nnz.sum() - You're not allowed to modify the data you get back from this view.""" - return self.vec_context(access=base.READ) + @utils.cached_property + def onz(self): + return self._o_nnz.sum() + def __contains__(self, other): + """Return true if other is a pair of maps in self.maps(). This + will also return true if the elements of other have parents in + self.maps().""" -class Dat(base.Dat, VecAccessMixin): - @utils.cached_property - def _vec(self): - assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) - # Can't duplicate layout_vec of dataset, because we then - # carry around extra unnecessary data. - # But use getSizes to save an Allreduce in computing the - # global size. - size = self.dataset.layout_vec.getSizes() - data = self._data[:size[0]] - return PETSc.Vec().createWithArray(data, size=size, bsize=self.cdim, comm=self.comm) - - @contextmanager - def vec_context(self, access): - r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`. - - :param access: Access descriptor: READ, WRITE, or RW.""" - # PETSc Vecs have a state counter and cache norm computations - # to return immediately if the state counter is unchanged. - # Since we've updated the data behind their back, we need to - # change that state counter. - self._vec.stateIncrease() - yield self._vec - if access is not base.READ: - self.halo_valid = False - - -class MixedDat(base.MixedDat, VecAccessMixin): - @utils.cached_property - def _vec(self): - assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) - # In this case we can just duplicate the layout vec - # because we're not placing an array. - return self.dataset.layout_vec.duplicate() - - @contextmanager - def vec_context(self, access): - r"""A context manager scattering the arrays of all components of this - :class:`MixedDat` into a contiguous :class:`PETSc.Vec` and reverse - scattering to the original arrays when exiting the context. - - :param access: Access descriptor: READ, WRITE, or RW. - - .. note:: - - The :class:`~PETSc.Vec` obtained from this context is in - the correct order to be left multiplied by a compatible - :class:`MixedMat`. 
In parallel it is *not* just a - concatenation of the underlying :class:`Dat`\s.""" - # Do the actual forward scatter to fill the full vector with - # values - if access is not base.WRITE: - offset = 0 - array = self._vec.array - for d in self: - with d.vec_ro as v: - size = v.local_size - array[offset:offset+size] = v.array_r[:] - offset += size - self._vec.stateIncrease() - yield self._vec - if access is not base.READ: - # Reverse scatter to get the values back to their original locations - offset = 0 - array = self._vec.array_r - for d in self: - with d.vec_wo as v: - size = v.local_size - v.array[:] = array[offset:offset+size] - offset += size - self.halo_valid = False - - -class Global(base.Global, VecAccessMixin): - @utils.cached_property - def _vec(self): - assert self.dtype == PETSc.ScalarType, \ - "Can't create Vec with type %s, must be %s" % (self.dtype, PETSc.ScalarType) - # Can't duplicate layout_vec of dataset, because we then - # carry around extra unnecessary data. - # But use getSizes to save an Allreduce in computing the - # global size. - data = self._data - size = self.dataset.layout_vec.getSizes() - if self.comm.rank == 0: - return PETSc.Vec().createWithArray(data, size=size, - bsize=self.cdim, - comm=self.comm) - else: - return PETSc.Vec().createWithArray(np.empty(0, dtype=self.dtype), - size=size, - bsize=self.cdim, - comm=self.comm) - - @contextmanager - def vec_context(self, access): - """A context manager for a :class:`PETSc.Vec` from a :class:`Global`. - - :param access: Access descriptor: READ, WRITE, or RW.""" - # PETSc Vecs have a state counter and cache norm computations - # to return immediately if the state counter is unchanged. - # Since we've updated the data behind their back, we need to - # change that state counter. - self._vec.stateIncrease() - yield self._vec - if access is not base.READ: - data = self._data - self.comm.Bcast(data, 0) - - -class SparsityBlock(base.Sparsity): + for maps in self.maps: + if tuple(other) <= maps: + return True + + return False + + +class SparsityBlock(Sparsity): """A proxy class for a block in a monolithic :class:`.Sparsity`. :arg parent: The parent monolithic sparsity. @@ -487,119 +401,216 @@ def masked_lgmap(lgmap, mask, block=True): return PETSc.LGMap().create(indices=indices, bsize=bsize, comm=lgmap.comm) -class MatBlock(base.Mat): - """A proxy class for a local block in a monolithic :class:`.Mat`. +class AbstractMat(DataCarrier, abc.ABC): + r"""OP2 matrix data. A ``Mat`` is defined on a sparsity pattern and holds a value + for each element in the :class:`Sparsity`. - :arg parent: The parent monolithic matrix. - :arg i: The block row. - :arg j: The block column. - """ - def __init__(self, parent, i, j): - self._parent = parent - self._i = i - self._j = j - self._sparsity = SparsityBlock(parent.sparsity, i, j) - rset, cset = self._parent.sparsity.dsets - rowis = rset.local_ises[i] - colis = cset.local_ises[j] - self.handle = parent.handle.getLocalSubMatrix(isrow=rowis, - iscol=colis) - self.comm = parent.comm - self.local_to_global_maps = self.handle.getLGMap() + When a ``Mat`` is passed to :func:`pyop2.op2.par_loop`, the maps via which + indirection occurs for the row and column space, and the access + descriptor are passed by `calling` the ``Mat``. 
For instance, if a + ``Mat`` named ``A`` is to be accessed for reading via a row :class:`Map` + named ``R`` and a column :class:`Map` named ``C``, this is accomplished by:: - @utils.cached_property - def _kernel_args_(self): - return (self.handle.handle, ) + A(pyop2.READ, (R[pyop2.i[0]], C[pyop2.i[1]])) - @utils.cached_property - def _wrapper_cache_key_(self): - return (type(self._parent), self._parent.dtype, self.dims) + Notice that it is `always` necessary to index the indirection maps + for a ``Mat``. See the :class:`Mat` documentation for more + details. - @property - def assembly_state(self): - # Track our assembly state only - return self._parent.assembly_state + .. note :: - @assembly_state.setter - def assembly_state(self, value): - self._parent.assembly_state = value + After executing :func:`par_loop`\s that write to a ``Mat`` and + before using it (for example to view its values), you must call + :meth:`assemble` to finalise the writes. + """ + @utils.cached_property + def pack(self): + from pyop2.codegen.builder import MatPack + return MatPack + + ASSEMBLED = "ASSEMBLED" + INSERT_VALUES = "INSERT_VALUES" + ADD_VALUES = "ADD_VALUES" + + _modes = [Access.WRITE, Access.INC] + + @utils.validate_type(('sparsity', Sparsity, ex.SparsityTypeError), + ('name', str, ex.NameTypeError)) + def __init__(self, sparsity, dtype=None, name=None): + self._sparsity = sparsity + self.lcomm = sparsity.lcomm + self.rcomm = sparsity.rcomm + self.comm = sparsity.comm + dtype = dtype or dtypes.ScalarType + self._datatype = np.dtype(dtype) + self._name = name or "mat_#x%x" % id(self) + self.assembly_state = Mat.ASSEMBLED - def __getitem__(self, idx): - return self + @utils.validate_in(('access', _modes, ex.ModeValueError)) + def __call__(self, access, path, lgmaps=None, unroll_map=False): + from pyop2.parloop import Arg + path_maps = utils.as_tuple(path, Map, 2) + if conf.configuration["type_check"] and tuple(path_maps) not in self.sparsity: + raise ex.MapValueError("Path maps not in sparsity maps") + return Arg(data=self, map=path_maps, access=access, lgmaps=lgmaps, unroll_map=unroll_map) - def __iter__(self): - yield self + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self), self.dtype, self.dims) - def _flush_assembly(self): - # Need to flush for all blocks - for b in self._parent: - b.handle.assemble(assembly=PETSc.Mat.AssemblyType.FLUSH) - self._parent._flush_assembly() + def assemble(self): + """Finalise this :class:`Mat` ready for use. - def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None): - rows = np.asarray(rows, dtype=IntType) - rbs, _ = self.dims[0][0] - if rbs > 1: - if idx is not None: - rows = rbs * rows + idx - else: - rows = np.dstack([rbs*rows + i for i in range(rbs)]).flatten() - rows = rows.reshape(-1, 1) - self.change_assembly_state(Mat.INSERT_VALUES) - if len(rows) > 0: - values = np.full(rows.shape, diag_val, dtype=ScalarType) - self.handle.setValuesLocalRCV(rows, rows, values, - addv=PETSc.InsertMode.INSERT_VALUES) + Call this /after/ executing all the par_loops that write to + the matrix before you want to look at it. 
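        A sketch of the intended call pattern (``kernel``, ``cells`` and the
        two maps are placeholder names)::

            op2.par_loop(kernel, cells, A(op2.INC, (row_map, col_map)))
            A.assemble()      # flush the pending additions
            vals = A.values   # only safe to inspect after assembling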
+ """ + raise NotImplementedError("Subclass should implement this") def addto_values(self, rows, cols, values): """Add a block of values to the :class:`Mat`.""" - self.change_assembly_state(Mat.ADD_VALUES) - if len(values) > 0: - self.handle.setValuesBlockedLocal(rows, cols, values, - addv=PETSc.InsertMode.ADD_VALUES) + raise NotImplementedError( + "Abstract Mat base class doesn't know how to set values.") def set_values(self, rows, cols, values): """Set a block of values in the :class:`Mat`.""" - self.change_assembly_state(Mat.INSERT_VALUES) - if len(values) > 0: - self.handle.setValuesBlockedLocal(rows, cols, values, - addv=PETSc.InsertMode.INSERT_VALUES) + raise NotImplementedError( + "Abstract Mat base class doesn't know how to set values.") - def assemble(self): - raise RuntimeError("Should never call assemble on MatBlock") + @utils.cached_property + def nblocks(self): + return int(np.prod(self.sparsity.shape)) + + @utils.cached_property + def _argtypes_(self): + """Ctypes argtype for this :class:`Mat`""" + return tuple(ctypes.c_voidp for _ in self) + + @utils.cached_property + def dims(self): + """A pair of integers giving the number of matrix rows and columns for + each member of the row :class:`Set` and column :class:`Set` + respectively. This corresponds to the ``cdim`` member of a + :class:`DataSet`.""" + return self._sparsity._dims + + @utils.cached_property + def nrows(self): + "The number of rows in the matrix (local to this process)" + return sum(d.size * d.cdim for d in self.sparsity.dsets[0]) + + @utils.cached_property + def nblock_rows(self): + """The number "block" rows in the matrix (local to this process). + + This is equivalent to the number of rows in the matrix divided + by the dimension of the row :class:`DataSet`. + """ + assert len(self.sparsity.dsets[0]) == 1, "Block rows don't make sense for mixed Mats" + return self.sparsity.dsets[0].size + + @utils.cached_property + def nblock_cols(self): + """The number of "block" columns in the matrix (local to this process). + + This is equivalent to the number of columns in the matrix + divided by the dimension of the column :class:`DataSet`. 
+ """ + assert len(self.sparsity.dsets[1]) == 1, "Block cols don't make sense for mixed Mats" + return self.sparsity.dsets[1].size + + @utils.cached_property + def ncols(self): + "The number of columns in the matrix (local to this process)" + return sum(d.size * d.cdim for d in self.sparsity.dsets[1]) + + @utils.cached_property + def sparsity(self): + """:class:`Sparsity` on which the ``Mat`` is defined.""" + return self._sparsity + + @utils.cached_property + def _is_scalar_field(self): + # Sparsity from Dat to MixedDat has a shape like (1, (1, 1)) + # (which you can't take the product of) + return all(np.prod(d) == 1 for d in self.dims) + + @utils.cached_property + def _is_vector_field(self): + return not self._is_scalar_field + + def change_assembly_state(self, new_state): + """Switch the matrix assembly state.""" + if new_state == Mat.ASSEMBLED or self.assembly_state == Mat.ASSEMBLED: + self.assembly_state = new_state + elif new_state != self.assembly_state: + self._flush_assembly() + self.assembly_state = new_state + else: + pass + + def _flush_assembly(self): + """Flush the in flight assembly operations (used when + switching between inserting and adding values).""" + pass @property def values(self): - rset, cset = self._parent.sparsity.dsets - rowis = rset.field_ises[self._i] - colis = cset.field_ises[self._j] - self._parent.assemble() - mat = self._parent.handle.createSubMatrix(isrow=rowis, - iscol=colis) - return mat[:, :] + """A numpy array of matrix values. - @property + .. warning :: + This is a dense array, so will need a lot of memory. It's + probably not a good idea to access this property if your + matrix has more than around 10000 degrees of freedom. + """ + raise NotImplementedError("Abstract base Mat does not implement values()") + + @utils.cached_property def dtype(self): - return self._parent.dtype + """The Python type of the data.""" + return self._datatype - @property + @utils.cached_property def nbytes(self): - return self._parent.nbytes // (np.prod(self.sparsity.shape)) + """Return an estimate of the size of the data associated with this + :class:`Mat` in bytes. This will be the correct size of the + data payload, but does not take into account the (presumably + small) overhead of the object and its metadata. The memory + associated with the sparsity pattern is also not recorded. + + Note that this is the process local memory usage, not the sum + over all MPI processes. + """ + if self._sparsity._block_sparse: + mult = np.sum(np.prod(self._sparsity.dims)) + else: + mult = 1 + return (self._sparsity.nz + self._sparsity.onz) \ + * self.dtype.itemsize * mult - def __repr__(self): - return "MatBlock(%r, %r, %r)" % (self._parent, self._i, self._j) + def __iter__(self): + """Yield self when iterated over.""" + yield self + + def __mul__(self, other): + """Multiply this :class:`Mat` with the vector ``other``.""" + raise NotImplementedError("Abstract base Mat does not implement multiplication") def __str__(self): - return "Block[%s, %s] of %s" % (self._i, self._j, self._parent) + return "OP2 Mat: %s, sparsity (%s), datatype %s" \ + % (self._name, self._sparsity, self._datatype.name) + + def __repr__(self): + return "Mat(%r, %r, %r)" \ + % (self._sparsity, self._datatype, self._name) -class Mat(base.Mat): +class Mat(AbstractMat): """OP2 matrix data. 
A Mat is defined on a sparsity pattern and holds a value for each element in the :class:`Sparsity`.""" def __init__(self, *args, **kwargs): self.mat_type = kwargs.pop("mat_type", None) - base.Mat.__init__(self, *args, **kwargs) + super().__init__(*args, **kwargs) self._init() self.assembly_state = Mat.ASSEMBLED @@ -610,7 +621,7 @@ def __init__(self, *args, **kwargs): def _kernel_args_(self): return tuple(a.handle.handle for a in self) - @collective + @mpi.collective def _init(self): if not self.dtype == PETSc.ScalarType: raise RuntimeError("Can only create a matrix of type %s, %s is not supported" @@ -648,7 +659,7 @@ def _init_dense(self): mat.setOption(mat.Option.SUBSET_OFF_PROC_ENTRIES, True) mat.setUp() # Put zeros in all the places we might eventually put a value. - with timed_region("MatZeroInitial"): + with profiling.timed_region("MatZeroInitial"): mat.zeroEntries() mat.assemble() @@ -680,7 +691,7 @@ def _init_monolithic(self): # The first assembly (filling with zeros) sets all possible entries. mat.setOption(mat.Option.SUBSET_OFF_PROC_ENTRIES, True) # Put zeros in all the places we might eventually put a value. - with timed_region("MatZeroInitial"): + with profiling.timed_region("MatZeroInitial"): for i in range(rows): for j in range(cols): sparsity.fill_with_zeros(self[i, j].handle, @@ -757,7 +768,7 @@ def _init_block(self): # entries, so raise an error if we "missed" one. mat.setOption(mat.Option.UNUSED_NONZERO_LOCATION_ERR, True) # Put zeros in all the places we might eventually put a value. - with timed_region("MatZeroInitial"): + with profiling.timed_region("MatZeroInitial"): sparsity.fill_with_zeros(mat, self.sparsity.dims[0][0], self.sparsity.maps, self.sparsity.iteration_regions, set_diag=self.sparsity._has_diagonal) @@ -783,17 +794,15 @@ def _init_global_block(self): def __call__(self, access, path, lgmaps=None, unroll_map=False): """Override the parent __call__ method in order to special-case global blocks in matrices.""" + from pyop2.parloop import Arg # One of the path entries was not an Arg. if path == (None, None): lgmaps, = lgmaps assert all(l is None for l in lgmaps) - return _make_object('Arg', - data=self.handle.getPythonContext().global_, - access=access) + return Arg(data=self.handle.getPythonContext().global_, access=access) elif None in path: thispath = path[0] or path[1] - return _make_object('Arg', data=self.handle.getPythonContext().dat, - map=thispath, access=access) + return Arg(data=self.handle.getPythonContext().dat, map=thispath, access=access) else: return super().__call__(access, path, lgmaps=lgmaps, unroll_map=unroll_map) @@ -810,13 +819,13 @@ def __iter__(self): """Iterate over all :class:`Mat` blocks by row and then by column.""" yield from itertools.chain(*self.blocks) - @collective + @mpi.collective def zero(self): """Zero the matrix.""" self.assemble() self.handle.zeroEntries() - @collective + @mpi.collective def zero_rows(self, rows, diag_val=1.0): """Zeroes the specified rows of the matrix, with the exception of the diagonal entry, which is set to diag_val. May be used for applying @@ -830,7 +839,7 @@ def zero_rows(self, rows, diag_val=1.0): def _flush_assembly(self): self.handle.assemble(assembly=PETSc.Mat.AssemblyType.FLUSH) - @collective + @mpi.collective def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None): """Set the diagonal entry in ``rows`` to a particular value. 
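A sketch of how ``zero_rows`` and ``set_local_diagonal_entries`` are
typically combined when imposing strong boundary conditions (``bc_rows`` is
an illustrative array of row indices, not part of this patch)::

    A.zero_rows(bc_rows, diag_val=1.0)  # global zeroing, unit diagonal kept
    # or, with process-local indices while assembly is still in flight:
    A.set_local_diagonal_entries(bc_rows, diag_val=1.0)
    A.assemble()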
@@ -840,7 +849,7 @@ def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None): The indices in ``rows`` should index the process-local rows of the matrix (no mapping to global indexes is applied). """ - rows = np.asarray(rows, dtype=IntType) + rows = np.asarray(rows, dtype=dtypes.IntType) rbs, _ = self.dims[0][0] if rbs > 1: if idx is not None: @@ -850,11 +859,11 @@ def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None): rows = rows.reshape(-1, 1) self.change_assembly_state(Mat.INSERT_VALUES) if len(rows) > 0: - values = np.full(rows.shape, diag_val, dtype=ScalarType) + values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType) self.handle.setValuesLocalRCV(rows, rows, values, addv=PETSc.InsertMode.INSERT_VALUES) - @collective + @mpi.collective def assemble(self): # If the matrix is nested, we need to check each subblock to # see if it needs assembling. But if it's monolithic then the @@ -902,10 +911,110 @@ def values(self): return self.handle[:, :] -class ParLoop(base.ParLoop): +class MatBlock(AbstractMat): + """A proxy class for a local block in a monolithic :class:`.Mat`. + + :arg parent: The parent monolithic matrix. + :arg i: The block row. + :arg j: The block column. + """ + def __init__(self, parent, i, j): + self._parent = parent + self._i = i + self._j = j + self._sparsity = SparsityBlock(parent.sparsity, i, j) + rset, cset = self._parent.sparsity.dsets + rowis = rset.local_ises[i] + colis = cset.local_ises[j] + self.handle = parent.handle.getLocalSubMatrix(isrow=rowis, + iscol=colis) + self.comm = parent.comm + self.local_to_global_maps = self.handle.getLGMap() + + @utils.cached_property + def _kernel_args_(self): + return (self.handle.handle, ) + + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self._parent), self._parent.dtype, self.dims) + + @property + def assembly_state(self): + # Track our assembly state only + return self._parent.assembly_state + + @assembly_state.setter + def assembly_state(self, value): + self._parent.assembly_state = value + + def __getitem__(self, idx): + return self + + def __iter__(self): + yield self + + def _flush_assembly(self): + # Need to flush for all blocks + for b in self._parent: + b.handle.assemble(assembly=PETSc.Mat.AssemblyType.FLUSH) + self._parent._flush_assembly() + + def set_local_diagonal_entries(self, rows, diag_val=1.0, idx=None): + rows = np.asarray(rows, dtype=dtypes.IntType) + rbs, _ = self.dims[0][0] + if rbs > 1: + if idx is not None: + rows = rbs * rows + idx + else: + rows = np.dstack([rbs*rows + i for i in range(rbs)]).flatten() + rows = rows.reshape(-1, 1) + self.change_assembly_state(Mat.INSERT_VALUES) + if len(rows) > 0: + values = np.full(rows.shape, diag_val, dtype=dtypes.ScalarType) + self.handle.setValuesLocalRCV(rows, rows, values, + addv=PETSc.InsertMode.INSERT_VALUES) + + def addto_values(self, rows, cols, values): + """Add a block of values to the :class:`Mat`.""" + self.change_assembly_state(Mat.ADD_VALUES) + if len(values) > 0: + self.handle.setValuesBlockedLocal(rows, cols, values, + addv=PETSc.InsertMode.ADD_VALUES) + + def set_values(self, rows, cols, values): + """Set a block of values in the :class:`Mat`.""" + self.change_assembly_state(Mat.INSERT_VALUES) + if len(values) > 0: + self.handle.setValuesBlockedLocal(rows, cols, values, + addv=PETSc.InsertMode.INSERT_VALUES) + + def assemble(self): + raise RuntimeError("Should never call assemble on MatBlock") - def log_flops(self, flops): - PETSc.Log.logFlops(flops) + @property + def values(self): + rset, cset = 
self._parent.sparsity.dsets + rowis = rset.field_ises[self._i] + colis = cset.field_ises[self._j] + self._parent.assemble() + mat = self._parent.handle.createSubMatrix(isrow=rowis, + iscol=colis) + return mat[:, :] + + @property + def dtype(self): + return self._parent.dtype + + @property + def nbytes(self): + return self._parent.nbytes // (np.prod(self.sparsity.shape)) + + def __repr__(self): + return "MatBlock(%r, %r, %r)" % (self._parent, self._i, self._j) + + def __str__(self): + return "Block[%s, %s] of %s" % (self._i, self._j, self._parent) def _DatMat(sparsity, dat=None): @@ -929,6 +1038,7 @@ def _DatMat(sparsity, dat=None): class _DatMatPayload(object): def __init__(self, sparsity, dat=None, dset=None): + from pyop2.types.dat import Dat if isinstance(sparsity.dsets[0], GlobalDataSet): self.dset = sparsity.dsets[1] self.sizes = ((None, 1), (self.dset.size * self.dset.cdim, None)) @@ -939,7 +1049,7 @@ def __init__(self, sparsity, dat=None, dset=None): raise ValueError("Not a DatMat") self.sparsity = sparsity - self.dat = dat or _make_object("Dat", self.dset, dtype=PETSc.ScalarType) + self.dat = dat or Dat(self.dset, dtype=PETSc.ScalarType) self.dset = dset def __getitem__(self, key): @@ -963,7 +1073,7 @@ def mult(self, mat, x, y): # Column matrix if x.sizes[1] == 1: v.copy(y) - a = np.zeros(1, dtype=ScalarType) + a = np.zeros(1, dtype=dtypes.ScalarType) if x.comm.rank == 0: a[0] = x.array_r else: @@ -979,7 +1089,7 @@ def multTranspose(self, mat, x, y): # Row matrix if x.sizes[1] == 1: v.copy(y) - a = np.zeros(1, dtype=ScalarType) + a = np.zeros(1, dtype=dtypes.ScalarType) if x.comm.rank == 0: a[0] = x.array_r else: @@ -1003,7 +1113,7 @@ def multTransposeAdd(self, mat, x, y, z): # Row matrix if x.sizes[1] == 1: v.copy(z) - a = np.zeros(1, dtype=ScalarType) + a = np.zeros(1, dtype=dtypes.ScalarType) if x.comm.rank == 0: a[0] = x.array_r else: @@ -1052,7 +1162,8 @@ def _GlobalMat(global_=None, comm=None): class _GlobalMatPayload(object): def __init__(self, global_=None, comm=None): - self.global_ = global_ or _make_object("Global", 1, dtype=PETSc.ScalarType, comm=comm) + from pyop2.types.glob import Global + self.global_ = global_ or Global(1, dtype=PETSc.ScalarType, comm=comm) def __getitem__(self, key): return self.global_.data_ro.reshape(1, 1)[key] diff --git a/pyop2/types/set.py b/pyop2/types/set.py new file mode 100644 index 000000000..7702d87f7 --- /dev/null +++ b/pyop2/types/set.py @@ -0,0 +1,626 @@ +import ctypes +import functools +import numbers + +import numpy as np + +from pyop2 import ( + caching, + datatypes as dtypes, + exceptions as ex, + mpi, + utils +) + + +class Set: + + """OP2 set. + + :param size: The size of the set. + :type size: integer or list of four integers. + :param string name: The name of the set (optional). + :param halo: An exisiting halo to use (optional). + + When the set is employed as an iteration space in a + :func:`pyop2.op2.par_loop`, the extent of any local iteration space within + each set entry is indicated in brackets. See the example in + :func:`pyop2.op2.par_loop` for more details. + + The size of the set can either be an integer, or a list of four + integers. 
The latter case is used for running in parallel where + we distinguish between: + + - `CORE` (owned and not touching halo) + - `OWNED` (owned, touching halo) + - `EXECUTE HALO` (not owned, but executed over redundantly) + - `NON EXECUTE HALO` (not owned, read when executing in the execute halo) + + If a single integer is passed, we assume that we're running in + serial and there is no distinction. + + The division of set elements is: :: + + [0, CORE) + [CORE, OWNED) + [OWNED, GHOST) + + Halo send/receive data is stored on sets in a :class:`Halo`. + """ + + _CORE_SIZE = 0 + _OWNED_SIZE = 1 + _GHOST_SIZE = 2 + + _extruded = False + + _kernel_args_ = () + _argtypes_ = () + + @utils.cached_property + def _wrapper_cache_key_(self): + return (type(self), ) + + @utils.validate_type(('size', (numbers.Integral, tuple, list, np.ndarray), ex.SizeTypeError), + ('name', str, ex.NameTypeError)) + def __init__(self, size, name=None, halo=None, comm=None): + self.comm = mpi.dup_comm(comm) + if isinstance(size, numbers.Integral): + size = [size] * 3 + size = utils.as_tuple(size, numbers.Integral, 3) + assert size[Set._CORE_SIZE] <= size[Set._OWNED_SIZE] <= \ + size[Set._GHOST_SIZE], "Set received invalid sizes: %s" % size + self._sizes = size + self._name = name or "set_#x%x" % id(self) + self._halo = halo + self._partition_size = 1024 + # A cache of objects built on top of this set + self._cache = {} + + @utils.cached_property + def core_size(self): + """Core set size. Owned elements not touching halo elements.""" + return self._sizes[Set._CORE_SIZE] + + @utils.cached_property + def size(self): + """Set size, owned elements.""" + return self._sizes[Set._OWNED_SIZE] + + @utils.cached_property + def total_size(self): + """Set size including ghost elements. + """ + return self._sizes[Set._GHOST_SIZE] + + @utils.cached_property + def sizes(self): + """Set sizes: core, owned, execute halo, total.""" + return self._sizes + + @utils.cached_property + def core_part(self): + return SetPartition(self, 0, self.core_size) + + @utils.cached_property + def owned_part(self): + return SetPartition(self, self.core_size, self.size - self.core_size) + + @utils.cached_property + def name(self): + """User-defined label""" + return self._name + + @utils.cached_property + def halo(self): + """:class:`Halo` associated with this Set""" + return self._halo + + @property + def partition_size(self): + """Default partition size""" + return self._partition_size + + @partition_size.setter + def partition_size(self, partition_value): + """Set the partition size""" + self._partition_size = partition_value + + def __iter__(self): + """Yield self when iterated over.""" + yield self + + def __getitem__(self, idx): + """Allow indexing to return self""" + assert idx == 0 + return self + + def __len__(self): + """This is not a mixed type and therefore of length 1.""" + return 1 + + def __str__(self): + return "OP2 Set: %s with size %s" % (self._name, self.size) + + def __repr__(self): + return "Set(%r, %r)" % (self._sizes, self._name) + + def __call__(self, *indices): + """Build a :class:`Subset` from this :class:`Set` + + :arg indices: The elements of this :class:`Set` from which the + :class:`Subset` should be formed. 
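        An illustrative sketch::

            cells = Set(10)
            bottom = cells([0, 1, 2])  # Subset of three elements
            corner = cells(0)          # a single scalar index also works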
+ + """ + if len(indices) == 1: + indices = indices[0] + if np.isscalar(indices): + indices = [indices] + return Subset(self, indices) + + def __contains__(self, dset): + """Indicate whether a given DataSet is compatible with this Set.""" + from pyop2.types import DataSet + if isinstance(dset, DataSet): + return dset.set is self + else: + return False + + def __pow__(self, e): + """Derive a :class:`DataSet` with dimension ``e``""" + from pyop2.types import DataSet + return DataSet(self, dim=e) + + @utils.cached_property + def layers(self): + """Return None (not an :class:`ExtrudedSet`).""" + return None + + def _check_operands(self, other): + if type(other) is Set: + if other is not self: + raise ValueError("Uable to perform set operations between two unrelated sets: %s and %s." % (self, other)) + elif type(other) is Subset: + if self is not other._superset: + raise TypeError("Superset mismatch: self (%s) != other._superset (%s)" % (self, other._superset)) + else: + raise TypeError("Unable to perform set operations between `Set` and %s." % (type(other), )) + + def intersection(self, other): + self._check_operands(other) + return other + + def union(self, other): + self._check_operands(other) + return self + + def difference(self, other): + self._check_operands(other) + if other is self: + return Subset(self, []) + else: + return type(other)(self, np.setdiff1d(np.asarray(range(self.total_size), dtype=dtypes.IntType), other._indices)) + + def symmetric_difference(self, other): + self._check_operands(other) + return self.difference(other) + + +class GlobalSet(Set): + + _extruded = False + + """A proxy set allowing a :class:`Global` to be used in place of a + :class:`Dat` where appropriate.""" + + _kernel_args_ = () + _argtypes_ = () + + def __init__(self, comm=None): + self.comm = mpi.dup_comm(comm) + self._cache = {} + + @utils.cached_property + def core_size(self): + return 0 + + @utils.cached_property + def size(self): + return 1 if self.comm.rank == 0 else 0 + + @utils.cached_property + def total_size(self): + """Total set size, including halo elements.""" + return 1 if self.comm.rank == 0 else 0 + + @utils.cached_property + def sizes(self): + """Set sizes: core, owned, execute halo, total.""" + return (self.core_size, self.size, self.total_size) + + @utils.cached_property + def name(self): + """User-defined label""" + return "GlobalSet" + + @utils.cached_property + def halo(self): + """:class:`Halo` associated with this Set""" + return None + + @property + def partition_size(self): + """Default partition size""" + return None + + def __iter__(self): + """Yield self when iterated over.""" + yield self + + def __getitem__(self, idx): + """Allow indexing to return self""" + assert idx == 0 + return self + + def __len__(self): + """This is not a mixed type and therefore of length 1.""" + return 1 + + def __str__(self): + return "OP2 GlobalSet" + + def __repr__(self): + return "GlobalSet()" + + def __eq__(self, other): + # Currently all GlobalSets compare equal. + return isinstance(other, GlobalSet) + + def __hash__(self): + # Currently all GlobalSets compare equal. + return hash(type(self)) + + +class ExtrudedSet(Set): + + """OP2 ExtrudedSet. + + :param parent: The parent :class:`Set` to build this :class:`ExtrudedSet` on top of + :type parent: a :class:`Set`. + :param layers: The number of layers in this :class:`ExtrudedSet`. 
+ :type layers: an integer, indicating the number of layers for every entity, + or an array of shape (parent.total_size, 2) giving the start + and one past the stop layer for every entity. An entry + ``a, b = layers[e, ...]`` means that the layers for entity + ``e`` run over :math:`[a, b)`. + + The number of layers indicates the number of time the base set is + extruded in the direction of the :class:`ExtrudedSet`. As a + result, there are ``layers-1`` extruded "cells" in an extruded set. + """ + + @utils.validate_type(('parent', Set, TypeError)) + def __init__(self, parent, layers): + self._parent = parent + try: + layers = utils.verify_reshape(layers, dtypes.IntType, (parent.total_size, 2)) + self.constant_layers = False + if layers.min() < 0: + raise ex.SizeTypeError("Bottom of layers must be >= 0") + if any(layers[:, 1] - layers[:, 0] < 1): + raise ex.SizeTypeError("Number of layers must be >= 0") + except ex.DataValueError: + # Legacy, integer + layers = np.asarray(layers, dtype=dtypes.IntType) + if layers.shape: + raise ex.SizeTypeError(f"Specifying layers per entity, but provided " + f"{layers.shape}, needed ({parent.total_size}, 2)") + if layers < 2: + raise ex.SizeTypeError("Need at least two layers, not %d", layers) + layers = np.asarray([[0, layers]], dtype=dtypes.IntType) + self.constant_layers = True + + self._layers = layers + self._extruded = True + + @utils.cached_property + def _kernel_args_(self): + return (self.layers_array.ctypes.data, ) + + @utils.cached_property + def _argtypes_(self): + return (ctypes.c_voidp, ) + + @utils.cached_property + def _wrapper_cache_key_(self): + return self.parent._wrapper_cache_key_ + (self.constant_layers, ) + + def __getattr__(self, name): + """Returns a :class:`Set` specific attribute.""" + value = getattr(self._parent, name) + setattr(self, name, value) + return value + + def __contains__(self, set): + return set is self.parent + + def __str__(self): + return "OP2 ExtrudedSet: %s with size %s (%s layers)" % \ + (self._name, self.size, self._layers) + + def __repr__(self): + return "ExtrudedSet(%r, %r)" % (self._parent, self._layers) + + @utils.cached_property + def parent(self): + return self._parent + + @utils.cached_property + def layers(self): + """The layers of this extruded set.""" + if self.constant_layers: + # Backwards compat + return self.layers_array[0, 1] + else: + raise ValueError("No single layer, use layers_array attribute") + + @utils.cached_property + def layers_array(self): + return self._layers + + +class Subset(ExtrudedSet): + + """OP2 subset. + + :param superset: The superset of the subset. + :type superset: a :class:`Set` or a :class:`Subset`. + :param indices: Elements of the superset that form the + subset. Duplicate values are removed when constructing the subset. + :type indices: a list of integers, or a numpy array. 
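    A small sketch of construction and of the set algebra defined further
    down (names invented for illustration)::

        cells = Set(10)
        even = Subset(cells, [0, 2, 4, 6, 8])
        odd = Subset(cells, [1, 3, 5, 7, 9])
        even.union(odd)         # Subset covering all ten indices
        even.intersection(odd)  # empty Subset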
+ """ + @utils.validate_type(('superset', Set, TypeError), + ('indices', (list, tuple, np.ndarray), TypeError)) + def __init__(self, superset, indices): + # sort and remove duplicates + indices = np.unique(indices) + if isinstance(superset, Subset): + # Unroll indices to point to those in the parent + indices = superset.indices[indices] + superset = superset.superset + assert type(superset) is Set or type(superset) is ExtrudedSet, \ + 'Subset construction failed, should not happen' + + self._superset = superset + self._indices = utils.verify_reshape(indices, dtypes.IntType, (len(indices),)) + + if len(self._indices) > 0 and (self._indices[0] < 0 or self._indices[-1] >= self._superset.total_size): + raise ex.SubsetIndexOutOfBounds( + 'Out of bounds indices in Subset construction: [%d, %d) not [0, %d)' % + (self._indices[0], self._indices[-1], self._superset.total_size)) + + self._sizes = ((self._indices < superset.core_size).sum(), + (self._indices < superset.size).sum(), + len(self._indices)) + self._extruded = superset._extruded + + @utils.cached_property + def _kernel_args_(self): + return self._superset._kernel_args_ + (self._indices.ctypes.data, ) + + @utils.cached_property + def _argtypes_(self): + return self._superset._argtypes_ + (ctypes.c_voidp, ) + + # Look up any unspecified attributes on the _set. + def __getattr__(self, name): + """Returns a :class:`Set` specific attribute.""" + value = getattr(self._superset, name) + setattr(self, name, value) + return value + + def __pow__(self, e): + """Derive a :class:`DataSet` with dimension ``e``""" + raise NotImplementedError("Deriving a DataSet from a Subset is unsupported") + + def __str__(self): + return "OP2 Subset: %s with sizes %s" % \ + (self._name, self._sizes) + + def __repr__(self): + return "Subset(%r, %r)" % (self._superset, self._indices) + + def __call__(self, *indices): + """Build a :class:`Subset` from this :class:`Subset` + + :arg indices: The elements of this :class:`Subset` from which the + :class:`Subset` should be formed. + + """ + if len(indices) == 1: + indices = indices[0] + if np.isscalar(indices): + indices = [indices] + return Subset(self, indices) + + @utils.cached_property + def superset(self): + """Returns the superset Set""" + return self._superset + + @utils.cached_property + def indices(self): + """Returns the indices pointing in the superset.""" + return self._indices + + @utils.cached_property + def owned_indices(self): + """Return the indices that correspond to the owned entities of the + superset. + """ + return self.indices[self.indices < self.superset.size] + + @utils.cached_property + def layers_array(self): + if self._superset.constant_layers: + return self._superset.layers_array + else: + return self._superset.layers_array[self.indices, ...] + + def _check_operands(self, other): + if type(other) is Set: + if other is not self._superset: + raise TypeError("Superset mismatch: self._superset (%s) != other (%s)" % (self._superset, other)) + elif type(other) is Subset: + if self._superset is not other._superset: + raise TypeError("Unable to perform set operation between subsets of mismatching supersets (%s != %s)" % (self._superset, other._superset)) + else: + raise TypeError("Unable to perform set operations between `Subset` and %s." 
% (type(other), )) + + def intersection(self, other): + self._check_operands(other) + if other is self._superset: + return self + else: + return type(self)(self._superset, np.intersect1d(self._indices, other._indices)) + + def union(self, other): + self._check_operands(other) + if other is self._superset: + return other + else: + return type(self)(self._superset, np.union1d(self._indices, other._indices)) + + def difference(self, other): + self._check_operands(other) + if other is self._superset: + return Subset(other, []) + else: + return type(self)(self._superset, np.setdiff1d(self._indices, other._indices)) + + def symmetric_difference(self, other): + self._check_operands(other) + if other is self._superset: + return other.symmetric_difference(self) + else: + return type(self)(self._superset, np.setxor1d(self._indices, other._indices)) + + +class SetPartition: + def __init__(self, set, offset, size): + self.set = set + self.offset = offset + self.size = size + + +class MixedSet(Set, caching.ObjectCached): + r"""A container for a bag of :class:`Set`\s.""" + + def __init__(self, sets): + r""":param iterable sets: Iterable of :class:`Set`\s or :class:`ExtrudedSet`\s""" + if self._initialized: + return + self._sets = sets + assert all(s is None or isinstance(s, GlobalSet) or ((s.layers == self._sets[0].layers).all() if s.layers is not None else True) for s in sets), \ + "All components of a MixedSet must have the same number of layers." + # TODO: do all sets need the same communicator? + self.comm = functools.reduce(lambda a, b: a or b, map(lambda s: s if s is None else s.comm, sets)) + self._initialized = True + + @utils.cached_property + def _kernel_args_(self): + raise NotImplementedError + + @utils.cached_property + def _argtypes_(self): + raise NotImplementedError + + @utils.cached_property + def _wrapper_cache_key_(self): + raise NotImplementedError + + @classmethod + def _process_args(cls, sets, **kwargs): + sets = [s for s in sets] + try: + sets = utils.as_tuple(sets, ExtrudedSet) + except TypeError: + sets = utils.as_tuple(sets, (Set, type(None))) + cache = sets[0] + return (cache, ) + (sets, ), kwargs + + @classmethod + def _cache_key(cls, sets, **kwargs): + return sets + + def __getitem__(self, idx): + """Return :class:`Set` with index ``idx`` or a given slice of sets.""" + return self._sets[idx] + + @utils.cached_property + def split(self): + r"""The underlying tuple of :class:`Set`\s.""" + return self._sets + + @utils.cached_property + def core_size(self): + """Core set size. 
Owned elements not touching halo elements."""
+        return sum(s.core_size for s in self._sets)
+
+    @utils.cached_property
+    def size(self):
+        """Set size, owned elements."""
+        return sum(0 if s is None else s.size for s in self._sets)
+
+    @utils.cached_property
+    def total_size(self):
+        """Total set size, including halo elements."""
+        return sum(s.total_size for s in self._sets)
+
+    @utils.cached_property
+    def sizes(self):
+        """Set sizes: core, owned, total."""
+        return (self.core_size, self.size, self.total_size)
+
+    @utils.cached_property
+    def name(self):
+        """User-defined labels."""
+        return tuple(s.name for s in self._sets)
+
+    @utils.cached_property
+    def halo(self):
+        r""":class:`Halo`\s associated with these :class:`Set`\s."""
+        halos = tuple(s.halo for s in self._sets)
+        return halos if any(halos) else None
+
+    @utils.cached_property
+    def _extruded(self):
+        return isinstance(self._sets[0], ExtrudedSet)
+
+    @utils.cached_property
+    def layers(self):
+        """Number of layers in the extruded mesh (or None if this MixedSet is not extruded)."""
+        return self._sets[0].layers
+
+    def __iter__(self):
+        r"""Yield all :class:`Set`\s when iterated over."""
+        for s in self._sets:
+            yield s
+
+    def __len__(self):
+        """Return number of contained :class:`Set`s."""
+        return len(self._sets)
+
+    def __pow__(self, e):
+        """Derive a :class:`MixedDataSet` with dimensions ``e``"""
+        from pyop2.types import MixedDataSet
+        return MixedDataSet(self._sets, e)
+
+    def __str__(self):
+        return "OP2 MixedSet composed of Sets: %s" % (self._sets,)
+
+    def __repr__(self):
+        return "MixedSet(%r)" % (self._sets,)
+
+    def __eq__(self, other):
+        return type(self) == type(other) and self._sets == other._sets
From 24b72140b6ba0603e82c29727a03442036254417 Mon Sep 17 00:00:00 2001
From: Connor Ward
Date: Wed, 29 Sep 2021 11:54:24 +0100
Subject: [PATCH 2/6] Moved pyparloop into parloop.py

---
 pyop2/compilation.py |   2 +-
 pyop2/kernel.py      |  20 ++++++
 pyop2/op2.py         |  21 +++---
 pyop2/parloop.py     | 163 +++++++++++++++++++++++++++++++++++------
 pyop2/pyparloop.py   | 168 ------------------------------------------
 5 files changed, 174 insertions(+), 200 deletions(-)
 delete mode 100644 pyop2/pyparloop.py

diff --git a/pyop2/compilation.py b/pyop2/compilation.py
index 97e0b4c0f..aabdaa9c1 100644
--- a/pyop2/compilation.py
+++ b/pyop2/compilation.py
@@ -48,7 +48,6 @@
 from pyop2.configuration import configuration
 from pyop2.logger import debug, progress, INFO
 from pyop2.exceptions import CompilationError
-from pyop2.base import JITModule


 def _check_hashes(x, y, datatype):
@@ -466,6 +465,7 @@ def load(jitmodule, extension, fn_name, cppargs=[], ldargs=[],
     :kwarg comm: Optional communicator to compile the code on (only rank 0 compiles code) (defaults to COMM_WORLD).
""" + from pyop2.parloop import JITModule if isinstance(jitmodule, str): class StrCode(object): def __init__(self, code, argtypes): diff --git a/pyop2/kernel.py b/pyop2/kernel.py index a73bbdf73..9a6c15387 100644 --- a/pyop2/kernel.py +++ b/pyop2/kernel.py @@ -128,3 +128,23 @@ def __repr__(self): def __eq__(self, other): return self.cache_key == other.cache_key + + +class PyKernel(Kernel): + @classmethod + def _cache_key(cls, *args, **kwargs): + return None + + def __init__(self, code, name=None, **kwargs): + self._func = code + self._name = name + + def __getattr__(self, attr): + """Return None on unrecognised attributes""" + return None + + def __call__(self, *args): + return self._func(*args) + + def __repr__(self): + return 'Kernel("""%s""", %r)' % (self._func, self._name) diff --git a/pyop2/op2.py b/pyop2/op2.py index 84ac26056..9611afb34 100644 --- a/pyop2/op2.py +++ b/pyop2/op2.py @@ -39,15 +39,18 @@ from pyop2.logger import debug, info, warning, error, critical, set_log_level from pyop2.mpi import MPI, COMM_WORLD, collective -from pyop2.sequential import par_loop, Kernel # noqa: F401 -from pyop2.sequential import READ, WRITE, RW, INC, MIN, MAX # noqa: F401 -from pyop2.base import ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL # noqa: F401 -from pyop2.sequential import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet # noqa: F401 -from pyop2.sequential import Map, MixedMap, PermutedMap, Sparsity, Halo # noqa: F401 -from pyop2.sequential import Global, GlobalDataSet # noqa: F401 -from pyop2.sequential import Dat, MixedDat, DatView, Mat # noqa: F401 -from pyop2.sequential import ParLoop as SeqParLoop -from pyop2.pyparloop import ParLoop as PyParLoop +from .types import ( + Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet, + Map, MixedMap, PermutedMap, Sparsity, Halo, + Global, GlobalDataSet, + Dat, MixedDat, DatView, Mat +) +from .types.access import READ, WRITE, RW, INC, MIN, MAX + +from pyop2.parloop import par_loop, ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL +from pyop2.kernel import Kernel + +from pyop2.parloop import ParLoop as SeqParLoop, PyParLoop import types import loopy diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 462ad707c..8675fa6f1 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -1,8 +1,10 @@ +import abc import collections import copy import ctypes import enum import itertools +import operator import os import types @@ -461,15 +463,12 @@ class IterationRegion(enum.IntEnum): """Iterate over all cells of an extruded mesh.""" -class ParLoop: +class AbstractParLoop(abc.ABC): """Represents the kernel, iteration space and arguments of a parallel loop invocation. - .. note :: - Users should not directly construct :class:`ParLoop` objects, but use :func:`pyop2.op2.par_loop` instead. - An optional keyword argument, ``iterate``, can be used to specify which region of an :class:`ExtrudedSet` the parallel loop should iterate over. @@ -522,6 +521,13 @@ def __init__(self, kernel, iterset, *args, **kwargs): self.arglist = self.prepare_arglist(iterset, *self.args) + def prepare_arglist(self, iterset, *args): + """Prepare the argument list for calling generated code. + :arg iterset: The :class:`Set` iterated over. + :arg args: A list of :class:`Args`, the argument to the :fn:`par_loop`. 
+ """ + return () + @utils.cached_property def num_flops(self): iterset = self.iterset @@ -535,6 +541,16 @@ def num_flops(self): size = layers - 1 return size * self._kernel.num_flops + def log_flops(self, flops): + pass + + @property + @mpi.collective + def _jitmodule(self): + """Return the :class:`JITModule` that encapsulates the compiled par_loop code. + Return None if the child class should deal with this in another way.""" + return None + @utils.cached_property def _parloop_event(self): return profiling.timed_region("ParLoopExecute") @@ -583,6 +599,16 @@ def compute(self): self.reduction_end() self.local_to_global_end() + @mpi.collective + def _compute(self, part, fun, *arglist): + """Executes the kernel over all members of a MPI-part of the iteration space. + :arg part: The :class:`SetPartition` to compute over + :arg fun: The :class:`JITModule` encapsulating the compiled + code (may be ignored by the backend). + :arg arglist: The arguments to pass to the compiled code (may + be ignored by the backend, depending on the exact implementation)""" + raise RuntimeError("Must select a backend") + @mpi.collective def global_to_local_begin(self): """Start halo exchanges.""" @@ -643,7 +669,6 @@ def reduction_end(self): @mpi.collective def update_arg_data_state(self): r"""Update the state of the :class:`DataCarrier`\s in the arguments to the `par_loop`. - This marks :class:`Mat`\s that need assembly.""" for arg in self.args: access = arg.access @@ -700,15 +725,13 @@ def iteration_region(self): interior facets.""" return self._iteration_region + +class ParLoop(AbstractParLoop): + def log_flops(self, flops): PETSc.Log.logFlops(flops) def prepare_arglist(self, iterset, *args): - """Prepare the argument list for calling generated code. - - :arg iterset: The :class:`Set` iterated over. - :arg args: A list of :class:`Args`, the argument to the :fn:`par_loop`. - """ arglist = iterset._kernel_args_ for arg in args: arglist += arg._kernel_args_ @@ -727,9 +750,6 @@ def prepare_arglist(self, iterset, *args): @utils.cached_property def _jitmodule(self): - """Return the :class:`JITModule` that encapsulates the compiled par_loop code. - - Return None if the child class should deal with this in another way.""" return JITModule(self.kernel, self.iterset, *self.args, iterate=self.iteration_region, pass_layer_arg=self._pass_layer_arg) @@ -740,18 +760,118 @@ def _compute_event(self): @mpi.collective def _compute(self, part, fun, *arglist): - """Executes the kernel over all members of a MPI-part of the iteration space. - - :arg part: The :class:`SetPartition` to compute over - :arg fun: The :class:`JITModule` encapsulating the compiled - code (may be ignored by the backend). - :arg arglist: The arguments to pass to the compiled code (may - be ignored by the backend, depending on the exact implementation)""" with self._compute_event: self.log_flops(part.size * self.num_flops) fun(part.offset, part.offset + part.size, *arglist) +class PyParLoop(AbstractParLoop): + """A stub implementation of "Python" parallel loops. + + This basically executes a python function over the iteration set, + feeding it the appropriate data for each set entity. + + Example usage:: + + .. code-block:: python + + s = op2.Set(10) + d = op2.Dat(s) + d2 = op2.Dat(s**2) + + m = op2.Map(s, s, 2, np.dstack(np.arange(4), + np.roll(np.arange(4), -1))) + + def fn(x, y): + x[0] = y[0] + x[1] = y[1] + + d.data[:] = np.arange(4) + + op2.par_loop(fn, s, d2(op2.WRITE), d(op2.READ, m)) + + print d2.data + # [[ 0. 1.] + # [ 1. 2.] + # [ 2. 3.] + # [ 3. 
  0.]]
+
+        def fn2(x, y):
+            x[0] += y[0]
+            x[1] += y[0]
+
+        op2.par_loop(fn2, s, d2(op2.INC), d(op2.READ, m[1]))
+
+        print(d2.data)
+        # [[ 1.  2.]
+        #  [ 3.  4.]
+        #  [ 5.  6.]
+        #  [ 3.  0.]]
+    """
+    def __init__(self, kernel, *args, **kwargs):
+        if not isinstance(kernel, types.FunctionType):
+            raise ValueError("Expecting a python function, not a %r" % type(kernel))
+        super().__init__(Kernel(kernel), *args, **kwargs)
+
+    def _compute(self, part, *arglist):
+        if part.set._extruded:
+            raise NotImplementedError
+        subset = isinstance(self.iterset, Subset)
+
+        def arrayview(array, access):
+            array = array.view()
+            array.setflags(write=(access is not Access.READ))
+            return array
+
+        # Just walk over the iteration set
+        for e in range(part.offset, part.offset + part.size):
+            args = []
+            if subset:
+                idx = self.iterset._indices[e]
+            else:
+                idx = e
+            for arg in self.args:
+                if arg._is_global:
+                    args.append(arrayview(arg.data._data, arg.access))
+                elif arg._is_direct:
+                    args.append(arrayview(arg.data._data[idx, ...], arg.access))
+                elif arg._is_indirect:
+                    args.append(arrayview(arg.data._data[arg.map.values_with_halo[idx], ...], arg.access))
+                elif arg._is_mat:
+                    if arg.access not in {Access.INC, Access.WRITE}:
+                        raise NotImplementedError
+                    if arg._is_mixed_mat:
+                        raise ValueError("Mixed Mats must be split before assembly")
+                    shape = tuple(map(operator.attrgetter("arity"), arg.map_tuple))
+                    args.append(np.zeros(shape, dtype=arg.data.dtype))
+                if args[-1].shape == ():
+                    args[-1] = args[-1].reshape(1)
+            self._kernel(*args)
+            for arg, tmp in zip(self.args, args):
+                if arg.access is Access.READ:
+                    continue
+                if arg._is_global:
+                    arg.data._data[:] = tmp[:]
+                elif arg._is_direct:
+                    arg.data._data[idx, ...] = tmp[:]
+                elif arg._is_indirect:
+                    arg.data._data[arg.map.values_with_halo[idx], ...] = tmp[:]
+                elif arg._is_mat:
+                    if arg.access is Access.INC:
+                        arg.data.addto_values(arg.map[0].values_with_halo[idx],
+                                              arg.map[1].values_with_halo[idx],
+                                              tmp)
+                    elif arg.access is Access.WRITE:
+                        arg.data.set_values(arg.map[0].values_with_halo[idx],
+                                            arg.map[1].values_with_halo[idx],
+                                            tmp)
+
+        for arg in self.args:
+            if arg._is_mat and arg.access is not Access.READ:
+                # Queue up assembly of matrix
+                arg.data.assemble()
+
+
 def check_iterset(args, iterset):
     """Checks that the iteration set of the :class:`ParLoop` matches the
     iteration set of all its arguments. A :class:`MapValueError` is raised
@@ -848,8 +968,7 @@ def par_loop(kernel, iterset, *args, **kwargs):
         passed to the kernel as a vector.
     """
     if isinstance(kernel, types.FunctionType):
-        from pyop2 import pyparloop
-        return pyparloop.ParLoop(kernel, iterset, *args, **kwargs).compute()
+        return PyParLoop(kernel, iterset, *args, **kwargs).compute()
     return ParLoop(kernel, iterset, *args, **kwargs).compute()
diff --git a/pyop2/pyparloop.py b/pyop2/pyparloop.py
deleted file mode 100644
index 8d1381f60..000000000
--- a/pyop2/pyparloop.py
+++ /dev/null
@@ -1,168 +0,0 @@
-# This file is part of PyOP2
-#
-# PyOP2 is Copyright (c) 2012-2014, Imperial College London and
-# others. Please see the AUTHORS file in the main source directory for
-# a full list of copyright holders. All rights reserved.
-#
-# Redistribution and use in source and binary forms, with or without
-# modification, are permitted provided that the following conditions
-# are met:
-#
-# * Redistributions of source code must retain the above copyright
-# notice, this list of conditions and the following disclaimer.
-# * Redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution. -# * The name of Imperial College London or that of other -# contributors may not be used to endorse or promote products -# derived from this software without specific prior written -# permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTERS -# ''AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS -# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE -# COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, -# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES -# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) -# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, -# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) -# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED -# OF THE POSSIBILITY OF SUCH DAMAGE. - -"""A stub implementation of "Python" parallel loops. - -This basically executes a python function over the iteration set, -feeding it the appropriate data for each set entity. - -Example usage:: - -.. code-block:: python - - s = op2.Set(10) - d = op2.Dat(s) - d2 = op2.Dat(s**2) - - m = op2.Map(s, s, 2, np.dstack(np.arange(4), - np.roll(np.arange(4), -1))) - - def fn(x, y): - x[0] = y[0] - x[1] = y[1] - - d.data[:] = np.arange(4) - - op2.par_loop(fn, s, d2(op2.WRITE), d(op2.READ, m)) - - print d2.data - # [[ 0. 1.] - # [ 1. 2.] - # [ 2. 3.] - # [ 3. 0.]] - - def fn2(x, y): - x[0] += y[0] - x[1] += y[0] - - op2.par_loop(fn, s, d2(op2.INC), d(op2.READ, m[1])) - - print d2.data - # [[ 1. 2.] - # [ 3. 4.] - # [ 5. 6.] - # [ 3. 
0.]] -""" - -from operator import attrgetter -import numpy as np -import types -from pyop2 import base - - -# Fake kernel for type checking -class Kernel(base.Kernel): - @classmethod - def _cache_key(cls, *args, **kwargs): - return None - - def __init__(self, code, name=None, **kwargs): - self._func = code - self._name = name - - def __getattr__(self, attr): - """Return None on unrecognised attributes""" - return None - - def __call__(self, *args): - return self._func(*args) - - def __repr__(self): - return 'Kernel("""%s""", %r)' % (self._func, self._name) - - -# Inherit from parloop for type checking and init -class ParLoop(base.ParLoop): - - def __init__(self, kernel, *args, **kwargs): - if not isinstance(kernel, types.FunctionType): - raise ValueError("Expecting a python function, not a %r" % type(kernel)) - super().__init__(Kernel(kernel), *args, **kwargs) - - def _compute(self, part, *arglist): - if part.set._extruded: - raise NotImplementedError - subset = isinstance(self.iterset, base.Subset) - - def arrayview(array, access): - array = array.view() - array.setflags(write=(access is not base.READ)) - return array - - # Just walk over the iteration set - for e in range(part.offset, part.offset + part.size): - args = [] - if subset: - idx = self.iterset._indices[e] - else: - idx = e - for arg in self.args: - if arg._is_global: - args.append(arrayview(arg.data._data, arg.access)) - elif arg._is_direct: - args.append(arrayview(arg.data._data[idx, ...], arg.access)) - elif arg._is_indirect: - args.append(arrayview(arg.data._data[arg.map.values_with_halo[idx], ...], arg.access)) - elif arg._is_mat: - if arg.access not in [base.INC, base.WRITE]: - raise NotImplementedError - if arg._is_mixed_mat: - raise ValueError("Mixed Mats must be split before assembly") - shape = tuple(map(attrgetter("arity"), arg.map_tuple)) - args.append(np.zeros(shape, dtype=arg.data.dtype)) - if args[-1].shape == (): - args[-1] = args[-1].reshape(1) - self._kernel(*args) - for arg, tmp in zip(self.args, args): - if arg.access is base.READ: - continue - if arg._is_global: - arg.data._data[:] = tmp[:] - elif arg._is_direct: - arg.data._data[idx, ...] = tmp[:] - elif arg._is_indirect: - arg.data._data[arg.map.values_with_halo[idx], ...] 
= tmp[:] - elif arg._is_mat: - if arg.access is base.INC: - arg.data.addto_values(arg.map[0].values_with_halo[idx], - arg.map[1].values_with_halo[idx], - tmp) - elif arg.access is base.WRITE: - arg.data.set_values(arg.map[0].values_with_halo[idx], - arg.map[1].values_with_halo[idx], - tmp) - - for arg in self.args: - if arg._is_mat and arg.access is not base.READ: - # Queue up assembly of matrix - arg.data.assemble() From 222ee582fa271f77a8b28de197779c902cbefc6b Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 29 Sep 2021 12:05:15 +0100 Subject: [PATCH 3/6] All tests passing --- pyop2/codegen/rep2loopy.py | 2 +- pyop2/parloop.py | 10 +++++----- pyop2/types/dat.py | 2 +- pyop2/types/glob.py | 2 +- test/unit/test_api.py | 17 +++++++---------- test/unit/test_caching.py | 8 +++++--- test/unit/test_global_reduction.py | 3 +-- 7 files changed, 21 insertions(+), 23 deletions(-) diff --git a/pyop2/codegen/rep2loopy.py b/pyop2/codegen/rep2loopy.py index 2dd21310e..ba8f17fb4 100644 --- a/pyop2/codegen/rep2loopy.py +++ b/pyop2/codegen/rep2loopy.py @@ -19,7 +19,7 @@ from pyop2.codegen.node import traversal, Node, Memoizer, reuse_if_untouched -from pyop2.base import READ, WRITE +from pyop2.types.access import READ, WRITE from pyop2.datatypes import as_ctypes from pyop2.codegen.optimise import index_merger, rename_nodes diff --git a/pyop2/parloop.py b/pyop2/parloop.py index 8675fa6f1..e18b35732 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -22,10 +22,10 @@ profiling, utils ) -from .kernel import Kernel +from .kernel import Kernel, PyKernel from .types import ( Access, - Global, Dat, Mat, Map, MixedDat, + Global, Dat, DatView, Mat, Map, MixedDat, AbstractDat, Set, MixedSet, ExtrudedSet, Subset ) @@ -190,7 +190,7 @@ def access(self): @utils.cached_property def _is_dat_view(self): - return isinstance(self.data, types.DatView) + return isinstance(self.data, DatView) @utils.cached_property def _is_mat(self): @@ -210,7 +210,7 @@ def _is_global_reduction(self): @utils.cached_property def _is_dat(self): - return isinstance(self.data, Dat) + return isinstance(self.data, AbstractDat) @utils.cached_property def _is_mixed_dat(self): @@ -811,7 +811,7 @@ def fn2(x, y): def __init__(self, kernel, *args, **kwargs): if not isinstance(kernel, types.FunctionType): raise ValueError("Expecting a python function, not a %r" % type(kernel)) - super().__init__(Kernel(kernel), *args, **kwargs) + super().__init__(PyKernel(kernel), *args, **kwargs) def _compute(self, part, *arglist): if part.set._extruded: diff --git a/pyop2/types/dat.py b/pyop2/types/dat.py index b238f8ae1..9abfa6d9c 100644 --- a/pyop2/types/dat.py +++ b/pyop2/types/dat.py @@ -167,7 +167,7 @@ def data_with_halos(self): With this accessor, you get to see up to date halo values, but you should not try and modify them, because they will be overwritten by the next halo exchange.""" - self.global_to_local_begin(Access.Access.RW) + self.global_to_local_begin(Access.RW) self.global_to_local_end(Access.RW) self.halo_valid = False v = self._data.view() diff --git a/pyop2/types/glob.py b/pyop2/types/glob.py index 5651db693..9470570e8 100644 --- a/pyop2/types/glob.py +++ b/pyop2/types/glob.py @@ -66,7 +66,7 @@ def _wrapper_cache_key_(self): @utils.validate_in(('access', _modes, ex.ModeValueError)) def __call__(self, access, path=None): - from parloop import Arg + from pyop2.parloop import Arg return Arg(data=self, access=access) def __iter__(self): diff --git a/test/unit/test_api.py b/test/unit/test_api.py index eee28bb35..777eac4d3 100644 --- 
a/test/unit/test_api.py +++ b/test/unit/test_api.py @@ -39,10 +39,7 @@ import numpy as np from numpy.testing import assert_equal -from pyop2 import op2 -from pyop2 import exceptions -from pyop2 import sequential -from pyop2 import base +from pyop2 import exceptions, op2 @pytest.fixture @@ -358,7 +355,7 @@ def test_iteration_incompatibility(self, set, m_iterset_toset, dat): e = op2.ExtrudedSet(set, 5) k = op2.Kernel('static void k() { }', 'k') with pytest.raises(exceptions.MapValueError): - base.ParLoop(k, e, dat(op2.READ, m_iterset_toset)) + op2.ParLoop(k, e, dat(op2.READ, m_iterset_toset)) class TestSubsetAPI: @@ -508,7 +505,7 @@ def test_mixed_set_ne_set(self, sets): def test_mixed_set_repr(self, mset): "MixedSet repr should produce a MixedSet object when eval'd." from pyop2.op2 import Set, MixedSet # noqa: needed by eval - assert isinstance(eval(repr(mset)), base.MixedSet) + assert isinstance(eval(repr(mset)), op2.MixedSet) def test_mixed_set_str(self, mset): "MixedSet should have the expected string representation." @@ -718,7 +715,7 @@ def test_mixed_dset_ne_dset(self, diterset, dtoset): def test_mixed_dset_repr(self, mdset): "MixedDataSet repr should produce a MixedDataSet object when eval'd." from pyop2.op2 import Set, DataSet, MixedDataSet # noqa: needed by eval - assert isinstance(eval(repr(mdset)), base.MixedDataSet) + assert isinstance(eval(repr(mdset)), op2.MixedDataSet) def test_mixed_dset_str(self, mdset): "MixedDataSet should have the expected string representation." @@ -1000,7 +997,7 @@ def test_mixed_dat_repr(self, mdat): "MixedDat repr should produce a MixedDat object when eval'd." from pyop2.op2 import Set, DataSet, MixedDataSet, Dat, MixedDat # noqa: needed by eval from numpy import dtype # noqa: needed by eval - assert isinstance(eval(repr(mdat)), base.MixedDat) + assert isinstance(eval(repr(mdat)), op2.MixedDat) def test_mixed_dat_str(self, mdat): "MixedDat should have the expected string representation." @@ -1220,7 +1217,7 @@ def test_mat_illegal_sets(self): def test_mat_illegal_name(self, sparsity): "Mat name should be string." - with pytest.raises(sequential.NameTypeError): + with pytest.raises(exceptions.NameTypeError): op2.Mat(sparsity, name=2) def test_mat_dtype(self, mat): @@ -1663,7 +1660,7 @@ def test_illegal_dat_iterset(self): map = op2.Map(set2, set1, 1, [0, 0, 0]) kernel = op2.Kernel("void k() { }", "k") with pytest.raises(exceptions.MapValueError): - base.ParLoop(kernel, set1, dat(op2.READ, map)) + op2.ParLoop(kernel, set1, dat(op2.READ, map)) def test_illegal_mat_iterset(self, sparsity): """ParLoop should reject a Mat argument using a different iteration diff --git a/test/unit/test_caching.py b/test/unit/test_caching.py index f3c68e0ef..783f6cf4e 100644 --- a/test/unit/test_caching.py +++ b/test/unit/test_caching.py @@ -34,7 +34,9 @@ import pytest import numpy -from pyop2 import op2, base +from pyop2 import op2 +import pyop2.kernel +import pyop2.parloop from coffee.base import * @@ -280,7 +282,7 @@ class TestGeneratedCodeCache: Generated Code Cache Tests. """ - cache = base.JITModule._cache + cache = pyop2.parloop.JITModule._cache @pytest.fixture def a(cls, diterset): @@ -470,7 +472,7 @@ class TestKernelCache: Kernel caching tests. 
""" - cache = base.Kernel._cache + cache = pyop2.kernel.Kernel._cache def test_kernels_same_code_same_name(self): """Kernels with same code and name should be retrieved from cache.""" diff --git a/test/unit/test_global_reduction.py b/test/unit/test_global_reduction.py index 4f3d6e29a..0a2f7ee68 100644 --- a/test/unit/test_global_reduction.py +++ b/test/unit/test_global_reduction.py @@ -449,10 +449,9 @@ def test_inc_repeated_loop(self, set): assert_allclose(g.data, set.size) def test_inc_reused_loop(self, set): - from pyop2.sequential import ParLoop g = op2.Global(1, 0, dtype=numpy.uint32) k = """void k(unsigned int* g) { *g += 1; }""" - loop = ParLoop(op2.Kernel(k, "k"), + loop = op2.ParLoop(op2.Kernel(k, "k"), set, g(op2.INC)) loop.compute() From 2e6043462cb1e9e52e7c91b528d0e1f53d8b0621 Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 29 Sep 2021 13:00:07 +0100 Subject: [PATCH 4/6] Fix Arg _is_mat check --- pyop2/parloop.py | 4 ++-- pyop2/types/mat.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pyop2/parloop.py b/pyop2/parloop.py index e18b35732..081fb33cc 100644 --- a/pyop2/parloop.py +++ b/pyop2/parloop.py @@ -25,7 +25,7 @@ from .kernel import Kernel, PyKernel from .types import ( Access, - Global, Dat, DatView, Mat, Map, MixedDat, AbstractDat, + Global, Dat, DatView, Mat, Map, MixedDat, AbstractDat, AbstractMat, Set, MixedSet, ExtrudedSet, Subset ) @@ -194,7 +194,7 @@ def _is_dat_view(self): @utils.cached_property def _is_mat(self): - return isinstance(self.data, Mat) + return isinstance(self.data, AbstractMat) @utils.cached_property def _is_mixed_mat(self): diff --git a/pyop2/types/mat.py b/pyop2/types/mat.py index 2ffdae6ff..f7da86547 100644 --- a/pyop2/types/mat.py +++ b/pyop2/types/mat.py @@ -1035,7 +1035,7 @@ def _DatMat(sparsity, dat=None): return A -class _DatMatPayload(object): +class _DatMatPayload: def __init__(self, sparsity, dat=None, dset=None): from pyop2.types.dat import Dat @@ -1159,7 +1159,7 @@ def _GlobalMat(global_=None, comm=None): return A -class _GlobalMatPayload(object): +class _GlobalMatPayload: def __init__(self, global_=None, comm=None): from pyop2.types.glob import Global From 55d96372d46ca0b4e92a87c4982e45a6962b5b95 Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 29 Sep 2021 13:18:51 +0100 Subject: [PATCH 5/6] Fix linting --- test/unit/test_global_reduction.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/test/unit/test_global_reduction.py b/test/unit/test_global_reduction.py index 0a2f7ee68..fa2258924 100644 --- a/test/unit/test_global_reduction.py +++ b/test/unit/test_global_reduction.py @@ -451,9 +451,7 @@ def test_inc_repeated_loop(self, set): def test_inc_reused_loop(self, set): g = op2.Global(1, 0, dtype=numpy.uint32) k = """void k(unsigned int* g) { *g += 1; }""" - loop = op2.ParLoop(op2.Kernel(k, "k"), - set, - g(op2.INC)) + loop = op2.ParLoop(op2.Kernel(k, "k"), set, g(op2.INC)) loop.compute() assert_allclose(g.data, set.size) loop.compute() From e4a9de6fe8c422ce2a03a758c4abcce4dc8550ea Mon Sep 17 00:00:00 2001 From: Connor Ward Date: Wed, 29 Sep 2021 14:34:18 +0100 Subject: [PATCH 6/6] Add subpackage to setup.py --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 3b30a377d..32a20fa16 100644 --- a/setup.py +++ b/setup.py @@ -147,7 +147,7 @@ def run(self): install_requires=install_requires, dependency_links=dep_links, test_requires=test_requires, - packages=['pyop2', 'pyop2.codegen'], + packages=['pyop2', 'pyop2.codegen', 'pyop2.types'], 
package_data={ 'pyop2': ['assets/*', '*.h', '*.pxd', '*.pyx', 'codegen/c/*.c']}, scripts=glob('scripts/*'),