From 0cb84c68f44b94c3f80aceeaf5cde1efba71e77e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 14:49:37 -0500
Subject: [PATCH 01/12] introduces an abstract backend class

---
 pyop2/backend.py | 38 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 38 insertions(+)
 create mode 100644 pyop2/backend.py

diff --git a/pyop2/backend.py b/pyop2/backend.py
new file mode 100644
index 000000000..c3299c2b6
--- /dev/null
+++ b/pyop2/backend.py
@@ -0,0 +1,38 @@
+class _not_implemented:  # noqa
+    """Not Implemented"""
+
+
+class AbstractComputeBackend:
+    Arg = _not_implemented()
+    ParLoop = _not_implemented()
+    Kernel = _not_implemented()
+    Dat = _not_implemented()
+    READ = _not_implemented()
+    WRITE = _not_implemented()
+    RW = _not_implemented()
+    INC = _not_implemented()
+    MIN = _not_implemented()
+    MAX = _not_implemented()
+    Set = _not_implemented()
+    ExtrudedSet = _not_implemented()
+    MixedSet = _not_implemented()
+    Subset = _not_implemented()
+    DataSet = _not_implemented()
+    MixedDataSet = _not_implemented()
+    Map = _not_implemented()
+    MixedMap = _not_implemented()
+    Sparsity = _not_implemented()
+    Halo = _not_implemented()
+    Dat = _not_implemented()
+    MixedDat = _not_implemented()
+    DatView = _not_implemented()
+    Mat = _not_implemented()
+    Global = _not_implemented()
+    GlobalDataSet = _not_implemented()
+
+    def __getattribute__(self, key):
+        val = super(AbstractComputeBackend, self).__getattribute__(key)
+        if isinstance(val, _not_implemented):
+            raise NotImplementedError("'{}' is not implemented for backend"
+                                      " '{}'.".format(key, type(self).__name__))
+        return val
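A minimal sketch, not part of the series, of how the placeholder attributes
combine with the __getattribute__ guard above; the ToyBackend name and its
Set binding are hypothetical:

    from pyop2.backend import AbstractComputeBackend

    class ToyBackend(AbstractComputeBackend):
        Set = frozenset    # a real backend would bind its own classes here

    backend = ToyBackend()
    backend.Set            # fine: the placeholder was overridden
    backend.Map            # raises NotImplementedError naming 'Map' and 'ToyBackend'

Anything a backend does not override fails loudly at first use rather than
misbehaving silently.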
From 5822ae686d17fc3bb0ee37f4e40e25c62d1321bd Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 14:52:57 -0500
Subject: [PATCH 02/12] global to store the current device

---
 pyop2/op2.py | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

diff --git a/pyop2/op2.py b/pyop2/op2.py
index d15ff1712..641bed183 100644
--- a/pyop2/op2.py
+++ b/pyop2/op2.py
@@ -39,16 +39,18 @@
 from pyop2.logger import debug, info, warning, error, critical, set_log_level
 from pyop2.mpi import MPI, COMM_WORLD, collective
 
-from pyop2.gpu.cuda import par_loop, Kernel  # noqa: F401
-from pyop2.gpu.cuda import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
 from pyop2.base import ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL  # noqa: F401
-from pyop2.gpu.cuda import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet  # noqa: F401
-from pyop2.gpu.cuda import Map, MixedMap, Sparsity, Halo  # noqa: F401
-from pyop2.gpu.cuda import Dat, MixedDat, DatView, Mat  # noqa: F401
-from pyop2.gpu.cuda import Global, GlobalDataSet  # noqa: F401
-from pyop2.gpu.cuda import ParLoop as SeqParLoop  # noqa: F401
-from pyop2.pyparloop import ParLoop as PyParLoop
+import pyop2.sequential
+from pyop2.sequential import par_loop, Kernel  # noqa: F401
+from pyop2.sequential import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
+from pyop2.sequential import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet  # noqa: F401
+from pyop2.sequential import Map, MixedMap, Sparsity, Halo  # noqa: F401
+from pyop2.sequential import Dat, MixedDat, DatView, Mat  # noqa: F401
+from pyop2.sequential import Global, GlobalDataSet  # noqa: F401
+from pyop2.sequential import ParLoop as SeqParLoop  # noqa: F401
+from pyop2.sequential import sequential_cpu_backend
+from pyop2.pyparloop import ParLoop as PyParLoop
 
 import types
 import loopy
 
@@ -66,7 +68,7 @@ def ParLoop(kernel, *args, **kwargs):
     if isinstance(kernel, types.FunctionType):
         return PyParLoop(kernel, *args, **kwargs)
     else:
-        return SeqParLoop(kernel, *args, **kwargs)
+        return compute_backend.ParLoop(kernel, *args, **kwargs)
 
 
 _initialised = False
@@ -122,3 +124,6 @@ def exit():
     configuration.reset()
     global _initialised
     _initialised = False
+
+
+compute_backend = sequential_cpu_backend

From 6fd5c2d612586c9abcc42fb1ae334dd8cd5ecfa5 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 14:57:27 -0500
Subject: [PATCH 03/12] choose all data structures from the compute backend

---
 pyop2/base.py | 26 ++++++++------------------
 1 file changed, 8 insertions(+), 18 deletions(-)

diff --git a/pyop2/base.py b/pyop2/base.py
index 512e3ca71..927767a6d 100644
--- a/pyop2/base.py
+++ b/pyop2/base.py
@@ -65,8 +65,10 @@
 
 
 def _make_object(name, *args, **kwargs):
-    from pyop2.gpu import cuda as backend
-    return getattr(backend, name)(*args, **kwargs)
+    # TODO: Should all _make_object("xyz", ...) calls be replaced by
+    # "compute_backend.xyz(...)"?
+    from pyop2.op2 import compute_backend
+    return getattr(compute_backend, name)(*args, **kwargs)
 
 
 # Data API
@@ -1355,7 +1357,6 @@ def pack(self):
                    ('name', str, NameTypeError))
     @validate_dtype(('dtype', None, DataTypeError))
     def __init__(self, dataset, data=None, dtype=None, name=None, uid=None):
-
         if isinstance(dataset, Dat):
             self.__init__(dataset.dataset, None, dtype=dataset.dtype,
                           name="copy_of_%s" % dataset.name)
@@ -2502,16 +2503,9 @@ def __init__(self, iterset, toset, arity, values=None, name=None, offset=None):
         self._toset = toset
         self.comm = toset.comm
         self._arity = arity
-        if False:
-            # maps indexed as `map[idof, icell]`
-            self._values = verify_reshape(values, IntType,
-                                          (arity, iterset.total_size),
-                                          allow_none=True)
-        else:
-            # maps indexed as `map[icell, idof]`
-            self._values = verify_reshape(values, IntType,
-                                          (iterset.total_size, arity),
-                                          allow_none=True)
+        self._values = verify_reshape(values, IntType,
+                                      (iterset.total_size, arity),
+                                      allow_none=True)
         self.shape = (iterset.total_size, arity)
         self._name = name or "map_%d" % Map._globalcount
         if offset is None or len(offset) == 0:
@@ -2589,11 +2583,7 @@ def values(self):
 
         This only returns the map values for local points, to see the halo
         points too, use :meth:`values_with_halo`."""
-        if False:
-            # Transposed maps
-            return self._values[:, :self.iterset.size]
-        else:
-            return self._values[:self.iterset.size]
+        return self._values[:self.iterset.size]
 
     @cached_property
     def values_with_halo(self):

From 8c8edaa57be757111f46414d315d858d61bd5195 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 15:02:24 -0500
Subject: [PATCH 04/12] adds option to disallow implicit host<->device transfers

---
 pyop2/configuration.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/pyop2/configuration.py b/pyop2/configuration.py
index a09a8e667..eaee81524 100644
--- a/pyop2/configuration.py
+++ b/pyop2/configuration.py
@@ -71,6 +71,10 @@ class Configuration(dict):
         cdim > 1 be built as block sparsities, or dof sparsities. The former
         saves memory but changes which preconditioners are available for the
         resulting matrices. (Default yes)
+    :param only_explicit_host_device_data_transfers: Flag to set the
+        host<->device transfer mode. If *True*, the user must invoke all
+        host<->device transfers explicitly. If *False* (default), Firedrake
+        figures out the data transfers automatically; this may be sub-optimal.
     """
     # name, env variable, type, default, write once
     DEFAULTS = {
@@ -112,6 +116,7 @@ class Configuration(dict):
         "print_summary": ("PYOP2_PRINT_SUMMARY", bool, False),
         "matnest": ("PYOP2_MATNEST", bool, True),
         "block_sparsity": ("PYOP2_BLOCK_SPARSITY", bool, True),
+        "only_explicit_host_device_data_transfers": ("EXPLICIT_TRNSFRS", bool, False)
     }
     """Default values for PyOP2 configuration parameters"""
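A short sketch of how the new flag is meant to be driven, assuming the
configuration dict that pyop2/configuration.py already exposes (the
strict-mode check that consumes it lands in PATCH 05 and PATCH 10):

    from pyop2.configuration import configuration

    # opt in to strict, user-driven transfers (defaults to False); the
    # EXPLICIT_TRNSFRS environment variable must instead be set before
    # PyOP2 is first imported
    configuration["only_explicit_host_device_data_transfers"] = True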
""" # name, env variable, type, default, write once DEFAULTS = { @@ -112,6 +116,7 @@ class Configuration(dict): "print_summary": ("PYOP2_PRINT_SUMMARY", bool, False), "matnest": ("PYOP2_MATNEST", bool, True), "block_sparsity": ("PYOP2_BLOCK_SPARSITY", bool, True), + "only_explicit_host_device_data_transfers": ("EXPLICIT_TRNSFRS", bool, False) } """Default values for PyOP2 configuration parameters""" From b0b60311b8d97a19c215e4373f968872f98bb125 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Sat, 2 May 2020 15:07:12 -0500 Subject: [PATCH 05/12] conform petsc backends to pyop2 backend --- pyop2/gpu/cuda.py | 109 ++++++++++++++++++++++++-------------------- pyop2/petsc_base.py | 69 ++++++++++++++++++++++++++++ pyop2/sequential.py | 34 ++++++++++++++ 3 files changed, 162 insertions(+), 50 deletions(-) diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py index e918f6efd..1c8d545e4 100644 --- a/pyop2/gpu/cuda.py +++ b/pyop2/gpu/cuda.py @@ -47,7 +47,8 @@ from pyop2.base import par_loop # noqa: F401 from pyop2.base import READ, WRITE, RW, INC, MIN, MAX # noqa: F401 from pyop2.base import ALL -from pyop2.base import Map, MixedMap, Sparsity, Halo # noqa: F401 +from pyop2.base import MixedMap, Sparsity, Halo # noqa: F401 +from pyop2.base import Map as BaseMap from pyop2.base import Set, ExtrudedSet, MixedSet, Subset # noqa: F401 from pyop2.base import DatView # noqa: F401 from pyop2.base import Kernel # noqa: F401 @@ -60,38 +61,45 @@ from pyop2.exceptions import * # noqa: F401 from pyop2.mpi import collective from pyop2.profiling import timed_region, timed_function -from pyop2.utils import cached_property +from pyop2.utils import * from pyop2.configuration import configuration import loopy import numpy import pycuda.driver as cuda from pytools import memoize_method +from pyop2.petsc_base import AbstractPETScBackend from pyop2.logger import ExecTimeNoter -class Map(Map): - - @cached_property - def device_handle(self): - m_gpu = cuda.mem_alloc(int(self.values.nbytes)) - cuda.memcpy_htod(m_gpu, self.values) - return m_gpu +class Map(BaseMap): + """Map for CuDA""" + + def __init__(self, base_map): + assert type(base_map) == BaseMap + self._iterset = base_map._iterset + self._toset = base_map._toset + self.comm = base_map.comm + self._arity = base_map._arity + # maps indexed as `map[icell, idof]` + self._values = base_map._values + self._values_cuda = cuda.mem_alloc(int(self._values.nbytes)) + cuda.memcpy_htod(self._values_cuda, self._values) + self.shape = base_map.shape + self._name = 'cuda_copy_%d_of_%s' % (BaseMap._globalcount, base_map._name) + self._offset = base_map._offset + # A cache for objects built on top of this map + self._cache = {} + BaseMap._globalcount += 1 @cached_property def _kernel_args_(self): - return (self.device_handle, ) - - -class Arg(Arg): - """ - Arg for GPU. - """ + return (self._values_cuda, ) class ExtrudedSet(ExtrudedSet): """ - ExtrudedSet for GPU. + ExtrudedSet for CUDA. """ @cached_property def _kernel_args_(self): @@ -102,7 +110,7 @@ def _kernel_args_(self): class Subset(Subset): """ - Subset for GPU. + Subset for CUDA. """ @cached_property def _kernel_args_(self): @@ -126,7 +134,6 @@ class Dat(petsc_Dat): """ Dat for GPU. """ - @cached_property def _vec(self): assert self.dtype == PETSc.ScalarType, \ @@ -144,37 +151,6 @@ def _vec(self): return cuda_vec - @cached_property - def device_handle(self): - if self.dtype == PETSc.ScalarType: - with self.vec as v: - return v.getCUDAHandle() - else: - # handle for ex. 
diff --git a/pyop2/petsc_base.py b/pyop2/petsc_base.py
index 4688f8864..5775f0fda 100644
--- a/pyop2/petsc_base.py
+++ b/pyop2/petsc_base.py
@@ -45,6 +45,8 @@
 from pyop2.base import _make_object, Subset
 from pyop2.mpi import collective
 from pyop2.profiling import timed_region
+from pyop2.configuration import configuration
+from pyop2.backend import AbstractComputeBackend
 
 
 class DataSet(base.DataSet):
@@ -359,6 +361,68 @@ def vec_context(self, access):
         if access is not base.READ:
             self.halo_valid = False
 
+    def _update_petsc_vec_type(self, vec, to_type):
+        from_type = vec.type
+        if from_type == to_type:
+            return
+
+        unknown_conversion_err = NotImplementedError("Cannot convert petsc vec"
+                " type from '{}' to '{}'.".format(from_type, to_type))
+
+        if from_type == 'seq':
+            if to_type == 'seqcuda':
+                data = vec.array.copy()
+                vec.setType('seqcuda')
+                vec.setArray(data)
+                # FIXME: update '._data'
+            else:
+                raise unknown_conversion_err
+        elif from_type == 'seqcuda':
+            if to_type == 'seq':
+                size = self.dataset.layout_vec.getSizes()
+                # FIXME: This is probably more involved
+                self._data = self.data.copy()
+                vec.setType('seq')
+                vec.setArray(self._data[:size[0]])
+            else:
+                raise unknown_conversion_err
+        else:
+            raise unknown_conversion_err
+
+    def ensure_availability_on(self, backend):
+        with self.vec as petsc_vec:
+            self._update_petsc_vec_type(petsc_vec,
+                                        backend.PETScVecType)
+
+    @property
+    def _kernel_args_(self):
+        from pyop2.op2 import compute_backend
+        with self.vec as petsc_vec:
+            if petsc_vec.type != compute_backend.PETScVecType:
+                if configuration['only_explicit_host_device_data_transfers']:
+                    raise RuntimeError("Memory location mismatch for"
+                                       " '{}'.".format(self.name))
+                else:
+                    self.ensure_availability_on(compute_backend)
+            if petsc_vec.type == 'seq':
+                return (self._data.ctypes.data, )
+            elif petsc_vec.type == 'seqcuda':
+                return (petsc_vec.getCUDAHandle(), )
+            else:
+                raise NotImplementedError()
+
+    @collective
+    @property
+    def data(self):
+        with self.vec as v:
+            if v.type == 'seq':
+                return v.array
+            elif v.type == 'seqcuda':
+                v.restoreCUDAHandle(v.getCUDAHandle())
+                return v.array
+            else:
+                raise NotImplementedError("Unknown vec type %s." % v.type)
+
 
 class MixedDat(base.MixedDat, VecAccessMixin):
     @utils.cached_property
@@ -1066,3 +1130,8 @@ def duplicate(self, mat, copy=True):
             return _GlobalMat(self.global_.duplicate(), comm=mat.comm)
         else:
             return _GlobalMat(comm=mat.comm)
+
+
+class AbstractPETScBackend(AbstractComputeBackend):
+    from pyop2.backend import _not_implemented
+    PETScVecType = _not_implemented()

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 6c78a005e..9b38075f2 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -57,6 +57,7 @@
 from pyop2.mpi import collective
 from pyop2.profiling import timed_region
 from pyop2.utils import cached_property, get_petsc_dir
+from pyop2.petsc_base import AbstractPETScBackend
 
 import loopy
 
@@ -243,3 +244,36 @@ def generate_single_cell_wrapper(iterset, args, forward_args=(), kernel_name=Non
 
     code = loopy.generate_code_v2(wrapper)
     return code.device_code()
+
+
+class SequentialCPUBackend(AbstractPETScBackend):
+    Arg = Arg
+    ParLoop = ParLoop
+    Kernel = Kernel
+    Dat = Dat
+    READ = READ
+    WRITE = WRITE
+    RW = RW
+    INC = INC
+    MIN = MIN
+    MAX = MAX
+    Set = Set
+    ExtrudedSet = ExtrudedSet
+    MixedSet = MixedSet
+    Subset = Subset
+    DataSet = DataSet
+    MixedDataSet = MixedDataSet
+    Map = Map
+    MixedMap = MixedMap
+    Sparsity = Sparsity
+    Halo = Halo
+    Dat = Dat
+    MixedDat = MixedDat
+    DatView = DatView
+    Mat = Mat
+    Global = Global
+    GlobalDataSet = GlobalDataSet
+    PETScVecType = 'seq'
+
+
+sequential_cpu_backend = SequentialCPUBackend()
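The seq to seqcuda retyping that _update_petsc_vec_type performs can be
exercised standalone with petsc4py; a rough sketch, assuming a CUDA-enabled
PETSc build and omitting error handling:

    import numpy
    from petsc4py import PETSc

    vec = PETSc.Vec().create(comm=PETSc.COMM_SELF)
    vec.setSizes(4)
    vec.setType('seq')
    vec.setArray(numpy.arange(4, dtype=PETSc.ScalarType))

    data = vec.array.copy()    # stash the host values
    vec.setType('seqcuda')     # retype the Vec for the device
    vec.setArray(data)         # upload the stashed values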
From 3a5b6476d1ba70b5ab9833e7f23ea245d52f2c8b Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 15:39:58 -0500
Subject: [PATCH 06/12] access descriptors should be backend agnostic

---
 pyop2/backend.py    | 6 ------
 pyop2/gpu/cuda.py   | 6 ------
 pyop2/op2.py        | 4 +++-
 pyop2/sequential.py | 6 ------
 4 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/pyop2/backend.py b/pyop2/backend.py
index c3299c2b6..5b089b5e3 100644
--- a/pyop2/backend.py
+++ b/pyop2/backend.py
@@ -7,12 +7,6 @@ class AbstractComputeBackend:
     ParLoop = _not_implemented()
     Kernel = _not_implemented()
     Dat = _not_implemented()
-    READ = _not_implemented()
-    WRITE = _not_implemented()
-    RW = _not_implemented()
-    INC = _not_implemented()
-    MIN = _not_implemented()
-    MAX = _not_implemented()
     Set = _not_implemented()
     ExtrudedSet = _not_implemented()
     MixedSet = _not_implemented()

diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index 1c8d545e4..c081b86e8 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -644,12 +644,6 @@ class CUDABackend(AbstractPETScBackend):
     ParLoop = ParLoop
     Kernel = Kernel
     Dat = Dat
-    READ = READ
-    WRITE = WRITE
-    RW = RW
-    INC = INC
-    MIN = MIN
-    MAX = MAX
     Set = Set
     ExtrudedSet = ExtrudedSet
    MixedSet = MixedSet

diff --git a/pyop2/op2.py b/pyop2/op2.py
index 641bed183..054f5065d 100644
--- a/pyop2/op2.py
+++ b/pyop2/op2.py
@@ -40,9 +40,11 @@
 from pyop2.mpi import MPI, COMM_WORLD, collective
 
 from pyop2.base import ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL  # noqa: F401
+from pyop2.base import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
+# TODO: These imports shouldn't be imported from sequential
+# i.e. all uses of op2.xyz should be replaced with op2.compute_backend.xyz
 import pyop2.sequential
 from pyop2.sequential import par_loop, Kernel  # noqa: F401
-from pyop2.sequential import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
 from pyop2.sequential import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet  # noqa: F401
 from pyop2.sequential import Map, MixedMap, Sparsity, Halo  # noqa: F401
 from pyop2.sequential import Dat, MixedDat, DatView, Mat  # noqa: F401

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 9b38075f2..754b8df91 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -251,12 +251,6 @@ class SequentialCPUBackend(AbstractPETScBackend):
     ParLoop = ParLoop
     Kernel = Kernel
     Dat = Dat
-    READ = READ
-    WRITE = WRITE
-    RW = RW
-    INC = INC
-    MIN = MIN
-    MAX = MAX
     Set = Set
     ExtrudedSet = ExtrudedSet
     MixedSet = MixedSet
From fbcf54690f50adbf5067aa91cae389ba9c52d43d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sat, 2 May 2020 18:43:46 -0500
Subject: [PATCH 07/12] Remove Arg, Kernel, Sparsity, Halo from ComputeBackend

---
 pyop2/backend.py    |  5 -----
 pyop2/base.py       | 22 +++++++++++-----------
 pyop2/gpu/cuda.py   |  5 -----
 pyop2/op2.py        |  6 +++---
 pyop2/sequential.py |  5 -----
 5 files changed, 14 insertions(+), 29 deletions(-)

diff --git a/pyop2/backend.py b/pyop2/backend.py
index 5b089b5e3..034a34c2f 100644
--- a/pyop2/backend.py
+++ b/pyop2/backend.py
@@ -3,10 +3,7 @@ class _not_implemented:  # noqa
 
 
 class AbstractComputeBackend:
-    Arg = _not_implemented()
     ParLoop = _not_implemented()
-    Kernel = _not_implemented()
-    Dat = _not_implemented()
     Set = _not_implemented()
     ExtrudedSet = _not_implemented()
     MixedSet = _not_implemented()
@@ -15,8 +12,6 @@ class AbstractComputeBackend:
     MixedDataSet = _not_implemented()
     Map = _not_implemented()
     MixedMap = _not_implemented()
-    Sparsity = _not_implemented()
-    Halo = _not_implemented()
     Dat = _not_implemented()
     MixedDat = _not_implemented()
     DatView = _not_implemented()

diff --git a/pyop2/base.py b/pyop2/base.py
index 927767a6d..3d2831295 100644
--- a/pyop2/base.py
+++ b/pyop2/base.py
@@ -214,12 +214,12 @@ def __iter__(self):
     def split(self):
         """Split a mixed argument into a tuple of constituent arguments."""
         if self._is_mixed_dat:
-            return tuple(_make_object('Arg', d, m, self._access)
+            return tuple(Arg(d, m, self._access)
                          for d, m in zip(self.data, self._map))
         elif self._is_mixed_mat:
             s = self.data.sparsity.shape
             mr, mc = self.map
-            return tuple(_make_object('Arg', self.data[i, j], (mr.split[i], mc.split[j]),
+            return tuple(Arg(self.data[i, j], (mr.split[i], mc.split[j]),
                                       self._access) for j in range(s[1]) for i in range(s[0]))
         else:
@@ -1397,7 +1397,7 @@ def _wrapper_cache_key_(self):
     def __call__(self, access, path=None):
         if configuration["type_check"] and path and path.toset != self.dataset.set:
             raise MapValueError("To Set of Map does not match Set of Dat.")
-        return _make_object('Arg', data=self, map=path, access=access)
+        return Arg(data=self, map=path, access=access)
 
     def __getitem__(self, idx):
         """Return self if ``idx`` is 0, raise an error otherwise."""
@@ -1577,7 +1577,7 @@ def zero(self, subset=None):
             data = loopy.GlobalArg("dat", dtype=self.dtype, shape=(self.cdim,))
             knl = loopy.make_function([domain], [insn], [data], name="zero")
 
-            knl = _make_object('Kernel', knl, 'zero')
+            knl = Kernel(knl, 'zero')
             loop = _make_object('ParLoop', knl, iterset,
                                 self(WRITE))
@@ -1611,7 +1611,7 @@ def _copy_parloop(self, other, subset=None):
                     loopy.GlobalArg("other", dtype=other.dtype, shape=(other.cdim,))]
             knl = loopy.make_function([domain], [insn], data, name="copy")
 
-            self._copy_kernel = _make_object('Kernel', knl, 'copy')
+            self._copy_kernel = Kernel(knl, 'copy')
         return _make_object('ParLoop', self._copy_kernel,
                             subset or self.dataset.set,
                             self(READ), other(WRITE))
@@ -1664,7 +1664,7 @@ def _op(self, other, op):
                     loopy.GlobalArg("other", dtype=other.dtype, shape=(other.cdim,)),
                     loopy.GlobalArg("ret", dtype=self.dtype, shape=(self.cdim,))]
             knl = loopy.make_function([domain], [insn], data, name=name)
-            k = _make_object('Kernel', knl, name)
+            k = Kernel(knl, name)
 
             par_loop(k, self.dataset.set, self(READ), other(READ), ret(WRITE))
@@ -1693,7 +1693,7 @@ def _iop(self, other, op):
             data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,)),
                     loopy.GlobalArg("other", dtype=other.dtype, shape=(other.cdim,))]
             knl = loopy.make_function([domain], [insn], data, name=name)
-            k = _make_object('Kernel', knl, name)
+            k = Kernel(knl, name)
 
             par_loop(k, self.dataset.set, self(INC), other(READ))
@@ -1715,7 +1715,7 @@ def _uop(self, op):
         insn = loopy.Assignment(_self.index(i), _op(_self.index(i)), within_inames=frozenset(["i"]))
         data = [loopy.GlobalArg("self", dtype=self.dtype, shape=(self.cdim,))]
         knl = loopy.make_function([domain], [insn], data, name=name)
-        k = _make_object('Kernel', knl, name)
+        k = Kernel(knl, name)
 
         par_loop(k, self.dataset.set, self(RW))
         return self
@@ -1746,7 +1746,7 @@ def inner(self, other):
                 loopy.GlobalArg("ret", dtype=ret.dtype, shape=(1,))]
         knl = loopy.make_function([domain], [insn], data, name="inner")
 
-        k = _make_object('Kernel', knl, "inner")
+        k = Kernel(knl, "inner")
         par_loop(k, self.dataset.set, self(READ), other(READ), ret(INC))
         return ret.data_ro[0]
@@ -2288,7 +2288,7 @@ def _wrapper_cache_key_(self):
 
     @validate_in(('access', _modes, ModeValueError))
     def __call__(self, access, path=None):
-        return _make_object('Arg', data=self, access=access)
+        return Arg(data=self, access=access)
 
     def __iter__(self):
         """Yield self when iterated over."""
@@ -3123,7 +3123,7 @@ def __call__(self, access, path, lgmaps=None, unroll_map=False):
         path_maps = as_tuple(path, Map, 2)
         if configuration["type_check"] and tuple(path_maps) not in self.sparsity:
             raise MapValueError("Path maps not in sparsity maps")
-        return _make_object('Arg', data=self, map=path_maps, access=access, lgmaps=lgmaps, unroll_map=unroll_map)
+        return Arg(data=self, map=path_maps, access=access, lgmaps=lgmaps, unroll_map=unroll_map)
 
     @cached_property
     def _wrapper_cache_key_(self):

diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index c081b86e8..9f9e17527 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -640,10 +640,7 @@ def insn_needs_atomic(insn):
 
 
 class CUDABackend(AbstractPETScBackend):
-    Arg = Arg
     ParLoop = ParLoop
-    Kernel = Kernel
-    Dat = Dat
     Set = Set
     ExtrudedSet = ExtrudedSet
     MixedSet = MixedSet
@@ -652,8 +649,6 @@ class CUDABackend(AbstractPETScBackend):
     MixedDataSet = MixedDataSet
     Map = Map
     MixedMap = MixedMap
-    Sparsity = Sparsity
-    Halo = Halo
     Dat = Dat
     MixedDat = MixedDat
     DatView = DatView

diff --git a/pyop2/op2.py b/pyop2/op2.py
index 054f5065d..f70279802 100644
--- a/pyop2/op2.py
+++ b/pyop2/op2.py
@@ -41,12 +41,12 @@
 
 from pyop2.base import ON_BOTTOM, ON_TOP, ON_INTERIOR_FACETS, ALL  # noqa: F401
 from pyop2.base import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
+from pyop2.base import Kernel, Sparsity, Halo  # noqa: F401
 # TODO: These imports shouldn't be imported from sequential
 # i.e. all uses of op2.xyz should be replaced with op2.compute_backend.xyz
-import pyop2.sequential
-from pyop2.sequential import par_loop, Kernel  # noqa: F401
+from pyop2.sequential import par_loop  # noqa: F401
 from pyop2.sequential import Set, ExtrudedSet, MixedSet, Subset, DataSet, MixedDataSet  # noqa: F401
-from pyop2.sequential import Map, MixedMap, Sparsity, Halo  # noqa: F401
+from pyop2.sequential import Map, MixedMap  # noqa: F401
 from pyop2.sequential import Dat, MixedDat, DatView, Mat  # noqa: F401
 from pyop2.sequential import Global, GlobalDataSet  # noqa: F401
 from pyop2.sequential import ParLoop as SeqParLoop  # noqa: F401

diff --git a/pyop2/sequential.py b/pyop2/sequential.py
index 754b8df91..17b53fe4c 100644
--- a/pyop2/sequential.py
+++ b/pyop2/sequential.py
@@ -247,10 +247,7 @@
 
 
 class SequentialCPUBackend(AbstractPETScBackend):
-    Arg = Arg
     ParLoop = ParLoop
-    Kernel = Kernel
-    Dat = Dat
     Set = Set
     ExtrudedSet = ExtrudedSet
     MixedSet = MixedSet
@@ -259,8 +256,6 @@ class SequentialCPUBackend(AbstractPETScBackend):
     MixedDataSet = MixedDataSet
     Map = Map
     MixedMap = MixedMap
-    Sparsity = Sparsity
-    Halo = Halo
     Dat = Dat
     MixedDat = MixedDat
     DatView = DatView
From d0d4b4af52497aad41da372a446515dff2c11660 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Sun, 3 May 2020 12:33:20 -0500
Subject: [PATCH 08/12] cleaner imports

---
 pyop2/gpu/cuda.py | 55 ++++++++++++++++++-----------------------------
 1 file changed, 21 insertions(+), 34 deletions(-)

diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index 9f9e17527..321cde4ff 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -44,20 +44,6 @@
 from pyop2 import base
 from pyop2 import compilation
 from pyop2 import petsc_base
-from pyop2.base import par_loop  # noqa: F401
-from pyop2.base import READ, WRITE, RW, INC, MIN, MAX  # noqa: F401
-from pyop2.base import ALL
-from pyop2.base import MixedMap, Sparsity, Halo  # noqa: F401
-from pyop2.base import Map as BaseMap
-from pyop2.base import Set, ExtrudedSet, MixedSet, Subset  # noqa: F401
-from pyop2.base import DatView  # noqa: F401
-from pyop2.base import Kernel  # noqa: F401
-from pyop2.base import Arg  # noqa: F401
-from pyop2.petsc_base import DataSet, MixedDataSet  # noqa: F401
-from pyop2.petsc_base import GlobalDataSet  # noqa: F401
-from pyop2.petsc_base import Dat as petsc_Dat
-from pyop2.petsc_base import Global as petsc_Global
-from pyop2.petsc_base import PETSc, MixedDat, Mat  # noqa: F401
 from pyop2.exceptions import *  # noqa: F401
 from pyop2.mpi import collective
 from pyop2.profiling import timed_region, timed_function
@@ -68,18 +54,18 @@
 from pyop2.utils import *
 from pyop2.configuration import configuration
 
 import loopy
 import numpy
 import pycuda.driver as cuda
 from pytools import memoize_method
-from pyop2.petsc_base import AbstractPETScBackend
+from pyop2.petsc_base import PETSc, AbstractPETScBackend
 
 from pyop2.logger import ExecTimeNoter
 
 
-class Map(BaseMap):
+class Map(base.Map):
     """Map for CUDA"""
 
     def __init__(self, base_map):
-        assert type(base_map) == BaseMap
+        assert type(base_map) == base.Map
         self._iterset = base_map._iterset
@@ -91,17 +77,17 @@ def __init__(self, base_map):
         cuda.memcpy_htod(self._values_cuda, self._values)
         self.shape = base_map.shape
-        self._name = 'cuda_copy_%d_of_%s' % (BaseMap._globalcount, base_map._name)
+        self._name = 'cuda_copy_%d_of_%s' % (base.Map._globalcount, base_map._name)
         self._offset = base_map._offset
         # A cache for objects built on top of this map
         self._cache = {}
-        BaseMap._globalcount += 1
+        base.Map._globalcount += 1
 
     @cached_property
     def _kernel_args_(self):
         return (self._values_cuda, )
 
 
-class ExtrudedSet(ExtrudedSet):
+class ExtrudedSet(base.ExtrudedSet):
     """
     ExtrudedSet for CUDA.
     """
@@ -112,7 +98,7 @@ def _kernel_args_(self):
         return (m_gpu,)
 
 
-class Subset(Subset):
+class Subset(base.Subset):
     """
     Subset for CUDA.
     """
@@ -123,10 +109,11 @@ def _kernel_args_(self):
         return self._superset._kernel_args_ + (m_gpu, )
 
 
-class DataSet(DataSet):
+class DataSet(petsc_base.DataSet):
     @cached_property
     def layout_vec(self):
         """A PETSc Vec compatible with the dof layout of this DataSet."""
+        1/0
         size = (self.size * self.cdim, None)
         vec = PETSc.Vec().create(comm=self.comm)
         vec.setSizes(size, bsize=self.cdim)
         vec.setType('cuda')
         vec.setUp()
         return vec
@@ -135,7 +122,7 @@ def layout_vec(self):
 
 
-class Dat(petsc_Dat):
+class Dat(petsc_base.Dat):
     """
     Dat for GPU.
     """
@@ -155,7 +142,7 @@ def _vec(self):
         return cuda_vec
 
 
-class Global(petsc_Global):
+class Global(petsc_base.Global):
 
     @cached_property
     def device_handle(self):
@@ -182,7 +169,7 @@ def __init__(self, kernel, iterset, *args, **kwargs):
         self._fun = None
         self._iterset = iterset
         self._args = args
-        self._iteration_region = kwargs.get('iterate', ALL)
+        self._iteration_region = kwargs.get('iterate', base.ALL)
         self._pass_layer_arg = kwargs.get('pass_layer_arg', False)
         # Copy the class variables, so we don't overwrite them
         self._cppargs = dcopy(type(self)._cppargs)
@@ -423,7 +410,7 @@ def prepare_arglist(self, iterset, *args):
         arglist = iterset._kernel_args_
         for arg in args:
             arglist += arg._kernel_args_
-            if arg.access is INC:
+            if arg.access is base.INC:
                 nbytes += arg.data.nbytes * 2
             else:
                 nbytes += arg.data.nbytes
@@ -641,20 +628,20 @@ def insn_needs_atomic(insn):
 
 class CUDABackend(AbstractPETScBackend):
     ParLoop = ParLoop
-    Set = Set
+    Set = base.Set
     ExtrudedSet = ExtrudedSet
-    MixedSet = MixedSet
+    MixedSet = base.MixedSet
     Subset = Subset
     DataSet = DataSet
-    MixedDataSet = MixedDataSet
+    MixedDataSet = petsc_base.MixedDataSet
     Map = Map
-    MixedMap = MixedMap
+    MixedMap = base.MixedMap
     Dat = Dat
-    MixedDat = MixedDat
-    DatView = DatView
-    Mat = Mat
+    MixedDat = petsc_base.MixedDat
+    DatView = base.DatView
+    Mat = petsc_base.Mat
     Global = Global
-    GlobalDataSet = GlobalDataSet
+    GlobalDataSet = petsc_base.GlobalDataSet
     PETScVecType = 'seqcuda'
From cc7bd4fcda10e85339dea5fbea74092b30dfe236 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 4 May 2020 03:40:29 -0500
Subject: [PATCH 09/12] backend dependent DataSet.layout_vec

---
 pyop2/gpu/cuda.py   |  3 +--
 pyop2/petsc_base.py | 39 +++++++++++++++++++++++++++++++++++++--
 2 files changed, 38 insertions(+), 4 deletions(-)

diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index 321cde4ff..6592cca72 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -107,9 +107,8 @@ class DataSet(petsc_base.DataSet):
     @cached_property
-    def layout_vec(self):
+    def _layout_vec(self):
         """A PETSc Vec compatible with the dof layout of this DataSet."""
-        1/0
         size = (self.size * self.cdim, None)
         vec = PETSc.Vec().create(comm=self.comm)
         vec.setSizes(size, bsize=self.cdim)
         vec.setType('cuda')

diff --git a/pyop2/petsc_base.py b/pyop2/petsc_base.py
index 5775f0fda..3cae7e1dd 100644
--- a/pyop2/petsc_base.py
+++ b/pyop2/petsc_base.py
@@ -124,17 +124,52 @@ def local_ises(self):
         return tuple(ises)
 
     @utils.cached_property
-    def layout_vec(self):
-        """A PETSc Vec compatible with the dof layout of this DataSet."""
+    def _layout_vec(self):
         vec = PETSc.Vec().create(comm=self.comm)
         size = (self.size * self.cdim, None)
         vec.setSizes(size, bsize=self.cdim)
         vec.setUp()
         return vec
 
+    def _update_petsc_vec_type(self, vec):
+        from pyop2.op2 import compute_backend
+        to_type = compute_backend.PETScVecType
+        from_type = vec.type
+        if from_type == to_type:
+            return
+
+        unknown_conversion_err = NotImplementedError("Cannot convert petsc vec"
+                " type from '{}' to '{}'.".format(from_type, to_type))
+
+        if from_type == 'seq':
+            if to_type == 'seqcuda':
+                data = vec.array.copy()
+                vec.setType('seqcuda')
+                vec.setArray(data)
+            else:
+                raise unknown_conversion_err
+        elif from_type == 'seqcuda':
+            if to_type == 'seq':
+                vec.restoreCUDAHandle(vec.getCUDAHandle())
+                data = vec.array.copy()
+                vec.setType('seq')
+                vec.setArray(data)
+            else:
+                raise unknown_conversion_err
+        else:
+            raise unknown_conversion_err
+
+    @property
+    def layout_vec(self):
+        """A PETSc Vec compatible with the dof layout of this DataSet."""
+        self._update_petsc_vec_type(self._layout_vec)
+        return self._layout_vec
+
     @utils.cached_property
     def dm(self):
         dm = PETSc.DMShell().create(comm=self.comm)
+        # FIXME: Ownership of 'dm' is not properly tied to DataSet in
+        # firedrake; this might lead to compute backend mismatch errors.
         dm.setGlobalVector(self.layout_vec)
         return dm
From da5d4c779260e602d2aec45c5ce99d9100c9b306 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 4 May 2020 06:45:36 -0500
Subject: [PATCH 10/12] enforce ensure availability on petsc vec rather than
 _kernel_args_

---
 pyop2/gpu/cuda.py   |  7 +++---
 pyop2/petsc_base.py | 61 ++++++++++++++++++++++++---------------------
 2 files changed, 35 insertions(+), 33 deletions(-)

diff --git a/pyop2/gpu/cuda.py b/pyop2/gpu/cuda.py
index 6592cca72..e17b577b4 100644
--- a/pyop2/gpu/cuda.py
+++ b/pyop2/gpu/cuda.py
@@ -112,7 +112,7 @@ def _layout_vec(self):
         size = (self.size * self.cdim, None)
         vec = PETSc.Vec().create(comm=self.comm)
         vec.setSizes(size, bsize=self.cdim)
-        vec.setType('cuda')
+        vec.setType('seqcuda')
         vec.setUp()
         return vec
 
@@ -129,11 +129,10 @@ def _vec(self):
         # But use getSizes to save an Allreduce in computing the
         # global size.
         size = self.dataset.layout_vec.getSizes()
-        data = self._data[:size[0]]
         cuda_vec = PETSc.Vec().create(self.comm)
         cuda_vec.setSizes(size=size, bsize=self.cdim)
-        cuda_vec.setType('cuda')
-        cuda_vec.setArray(data)
+        cuda_vec.setType('seqcuda')
+        cuda_vec.setArray(self._data[:size[0]])
 
         return cuda_vec

diff --git a/pyop2/petsc_base.py b/pyop2/petsc_base.py
index 3cae7e1dd..634d781c9 100644
--- a/pyop2/petsc_base.py
+++ b/pyop2/petsc_base.py
@@ -382,20 +382,6 @@ def _vec(self):
         data = self._data[:size[0]]
         return PETSc.Vec().createWithArray(data, size=size, bsize=self.cdim, comm=self.comm)
 
-    @contextmanager
-    def vec_context(self, access):
-        r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`.
-
-        :param access: Access descriptor: READ, WRITE, or RW."""
-        # PETSc Vecs have a state counter and cache norm computations
-        # to return immediately if the state counter is unchanged.
-        # Since we've updated the data behind their back, we need to
-        # change that state counter.
-        self._vec.stateIncrease()
-        yield self._vec
-        if access is not base.READ:
-            self.halo_valid = False
-
     def _update_petsc_vec_type(self, vec, to_type):
         from_type = vec.type
         if from_type == to_type:
             return
@@ -406,17 +392,19 @@ def _update_petsc_vec_type(self, vec, to_type):
 
         if from_type == 'seq':
             if to_type == 'seqcuda':
-                data = vec.array.copy()
+                # FIXME: Why is the reshape needed?
+                size = self.dataset.layout_vec.getSizes()
+                self._data = vec.array.reshape(self._data.shape).copy()
                 vec.setType('seqcuda')
-                vec.setArray(data)
-                # FIXME: update '._data'
+                vec.setArray(self._data[:size[0]])
             else:
                 raise unknown_conversion_err
         elif from_type == 'seqcuda':
             if to_type == 'seq':
+                # FIXME: Why is the reshape needed?
                 size = self.dataset.layout_vec.getSizes()
-                # FIXME: This is probably more involved
-                self._data = self.data.copy()
+                vec.restoreCUDAHandle(vec.getCUDAHandle())
+                self._data = vec.array.reshape(self._data.shape).copy()
                 vec.setType('seq')
                 vec.setArray(self._data[:size[0]])
             else:
@@ -425,20 +413,35 @@ def _update_petsc_vec_type(self, vec, to_type):
             raise unknown_conversion_err
 
     def ensure_availability_on(self, backend):
-        with self.vec as petsc_vec:
-            self._update_petsc_vec_type(petsc_vec,
-                                        backend.PETScVecType)
+        self._update_petsc_vec_type(self._vec,
+                                    backend.PETScVecType)
+
+    @contextmanager
+    def vec_context(self, access):
+        r"""A context manager for a :class:`PETSc.Vec` from a :class:`Dat`.
+
+        :param access: Access descriptor: READ, WRITE, or RW."""
+        # PETSc Vecs have a state counter and cache norm computations
+        # to return immediately if the state counter is unchanged.
+        # Since we've updated the data behind their back, we need to
+        # change that state counter.
+
+        from pyop2.op2 import compute_backend
+        self._vec.stateIncrease()
+        if self._vec.type != compute_backend.PETScVecType:
+            if configuration['only_explicit_host_device_data_transfers']:
+                raise RuntimeError("Memory location mismatch for"
+                                   " '{}'.".format(self.name))
+            else:
+                self.ensure_availability_on(compute_backend)
+
+        yield self._vec
+        if access is not base.READ:
+            self.halo_valid = False
 
     @property
     def _kernel_args_(self):
-        from pyop2.op2 import compute_backend
         with self.vec as petsc_vec:
-            if petsc_vec.type != compute_backend.PETScVecType:
-                if configuration['only_explicit_host_device_data_transfers']:
-                    raise RuntimeError("Memory location mismatch for"
-                                       " '{}'.".format(self.name))
-                else:
-                    self.ensure_availability_on(compute_backend)
             if petsc_vec.type == 'seq':
                 return (self._data.ctypes.data, )
             elif petsc_vec.type == 'seqcuda':
                 return (petsc_vec.getCUDAHandle(), )
             else:
                 raise NotImplementedError()
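With the PATCH 04 flag enabled, the intended workflow after this patch is
that the user moves data explicitly before a mismatching access; a hedged
sketch, where dat stands for a Dat built elsewhere:

    from pyop2.configuration import configuration
    from pyop2.gpu.cuda import cuda_backend

    configuration["only_explicit_host_device_data_transfers"] = True

    dat.ensure_availability_on(cuda_backend)  # explicit host-to-device move
    # accessing dat from the CUDA backend now finds a 'seqcuda' Vec;
    # without the call above, vec_context raises the
    # "Memory location mismatch" RuntimeError instead of copying silently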
From 0ae553fdb5ae9352e0d80db35f3d156d3d74a7ce Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 4 May 2020 09:39:19 -0500
Subject: [PATCH 11/12] restores cuda handles after every modification access

---
 pyop2/petsc_base.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pyop2/petsc_base.py b/pyop2/petsc_base.py
index 634d781c9..1ae9f57f2 100644
--- a/pyop2/petsc_base.py
+++ b/pyop2/petsc_base.py
@@ -439,6 +439,9 @@ def vec_context(self, access):
         if access is not base.READ:
             self.halo_valid = False
 
+        if self._vec.type == 'seqcuda':
+            self._vec.restoreCUDAHandle(self._vec.getCUDAHandle())
+
     @property
     def _kernel_args_(self):
         with self.vec as petsc_vec:
From 56b44f6e3532571720fbdd8b5514803e89867198 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni
Date: Mon, 4 May 2020 22:31:17 -0500
Subject: [PATCH 12/12] docs

---
 pyop2/backend.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pyop2/backend.py b/pyop2/backend.py
index 034a34c2f..67b9daaef 100644
--- a/pyop2/backend.py
+++ b/pyop2/backend.py
@@ -3,6 +3,10 @@ class _not_implemented:  # noqa
 
 
 class AbstractComputeBackend:
+    """
+    Abstract class to record all the backend-specific implementations of
+    :mod:`pyop2`'s data structures.
+    """
     ParLoop = _not_implemented()
     Set = _not_implemented()
     ExtrudedSet = _not_implemented()
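As of the end of this series the active backend is still the module-level
global introduced in PATCH 02, so switching backends is a plain rebinding; a
sketch of the assumed usage until a proper setter exists:

    from pyop2 import op2
    from pyop2.gpu.cuda import cuda_backend

    op2.compute_backend = cuda_backend
    # the op2.ParLoop wrapper now dispatches to compute_backend.ParLoop,
    # and Dat accesses migrate (or raise, in strict mode) accordingly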