diff --git a/nutils/evaluable.py b/nutils/evaluable.py
index 119d3f602..a36a6393c 100644
--- a/nutils/evaluable.py
+++ b/nutils/evaluable.py
@@ -330,6 +330,27 @@ def _optimized_for_numpy1(obj):
     def _optimized_for_numpy(self):
         return
 
+    @util.shallow_replace
+    def _merge_inflates(self):
+        if isinstance(self, Inflate):
+            func = self.func
+            dofmaps = self.dofmap,
+            dofshape = self.dofmap.shape
+            lengths = self.length,
+            while True:
+                for axis, parts in func._inflations:
+                    if axis == func.ndim - len(dofshape) - 1 and len(parts) == 1:
+                        lengths = (func.shape[axis],) + lengths
+                        (dofmap, func), = parts.items()
+                        dofmaps = appendaxes(dofmap, dofshape), *[prependaxes(d, dofmap.shape) for d in dofmaps]
+                        dofshape = dofmap.shape + dofshape
+                        break # continue the outer while loop
+                else:
+                    break
+            # merge inner inflates
+            func = func._merge_inflates()
+            return _MultiInflate(func, dofmaps, lengths)
+
     @cached_property
     def _loop_deps(self):
         deps = util.IDSet()
@@ -969,6 +990,11 @@ def _unaligned(self):
     _diagonals = ()
     _inflations = ()
 
+    @property
+    def _as_range_with_offset(self):
+        if self.ndim == 1 and _equals_scalar_constant(self.shape[0], 1):
+            return self.shape[0], _OfnTakeUnit(self, constant(0))
+
     def _derivative(self, var, seen):
         if self.dtype in (bool, int) or var not in self.arguments:
             return Zeros(self.shape + var.shape, dtype=self.dtype)
@@ -1690,6 +1716,12 @@ def _sum(self, axis):
                 summed = sum(multiply(*factors[:i], *factors[i+1:]), axis)
                 return summed * align(unaligned, [i-(i > axis) for i in where], summed.shape)
 
+    def _loopsum(self, index):
+        factors = tuple(self._factors)
+        for i, fi in enumerate(factors):
+            if index not in fi.arguments:
+                return fi * loop_sum(multiply(*factors[:i], *factors[i+1:]), index)
+
     def _add(self, other):
         factors = list(self._factors)
         other_factors = []
@@ -1937,6 +1969,16 @@ def _intbounds_impl(self):
         lowers, uppers = zip(*[f._intbounds for f in self._terms])
         return builtins.sum(lowers), builtins.sum(uppers)
 
+    @cached_property
+    def _as_range_with_offset(self):
+        if self.ndim != 1:
+            return
+        for func1, func2 in (self.funcs, reversed(tuple(self.funcs))):
+            if (length_offset1 := func1._as_range_with_offset) is not None and isinstance(func2, InsertAxis):
+                length, offset1 = length_offset1
+                offset2 = func2.func
+                return length, offset1 + offset2
+
 
 class Einsum(Array):
 
@@ -2000,6 +2042,10 @@ def _takediag(self, axis1, axis2):
         args_idx = tuple(tuple(ikeep if i == irm else i for i in idx) for idx in self.args_idx)
         return Einsum(self.args, args_idx, self.out_idx[:axis1] + self.out_idx[axis1+1:axis2] + self.out_idx[axis2+1:] + (ikeep,))
 
+    def _intbounds_impl(self):
+        lower = 0 if all(arg._intbounds[0] >= 0 for arg in self.args) else float('-inf')
+        return lower, float('inf')
+
 
 class Sum(Array):
 
@@ -2134,6 +2180,7 @@ class Take(Array):
     def __init__(self, func: Array, indices: Array):
         assert isinstance(func, Array) and func.ndim > 0, f'func={func!r}'
         assert isinstance(indices, Array) and indices.dtype == int, f'indices={indices!r}'
+        #assert isinstance(indices, Array) and indices.dtype == int and indices._intbounds[0] >= 0, f'indices={indices!r}'
         self.func = func
         self.indices = indices
         super().__init__(args=(func, indices), shape=func.shape[:-1]+indices.shape, dtype=func.dtype)
@@ -2153,6 +2200,13 @@ def _simplified(self):
             if axis == self.func.ndim - 1:
                 return util.sum(Inflate(func, dofmap, self.func.shape[-1])._take(self.indices, self.func.ndim - 1) for dofmap, func in parts.items())
 
+    def _optimized_for_numpy(self):
+        if self.indices.ndim == 0:
+            return _OfnTakeUnit(self.func, self.indices)
+        if (length_offset := self.indices._as_range_with_offset) is not None:
+            length, offset = length_offset
+            return _OfnTakeSlice(self.func, offset, length)
+
     @staticmethod
     def evalf(arr, indices):
         return arr[..., indices]
@@ -2182,6 +2236,48 @@ def _intbounds_impl(self):
         return self.func._intbounds
 
 
+class _OfnTakeUnit(Array):
+
+    def __init__(self, func: Array, index: Array):
+        assert isinstance(func, Array) and func.ndim > 0, f'func={func!r}'
+        assert isinstance(index, Array) and index.dtype == int and index.ndim == 0, f'index={index!r}'
+        #assert _isindex(index), f'index={index!r}'
+        self.func = func
+        self.index = index
+        super().__init__(args=(func, index), shape=func.shape[:-1], dtype=func.dtype)
+
+    def evalf(self, func, index):
+        return func[...,index]
+
+    def _compile_expression(self, add_const, func, index):
+        return func.get_item(_pyast.Tuple((_pyast.Raw('...'), index)))
+
+    def _intbounds_impl(self):
+        return self.func._intbounds
+
+
+class _OfnTakeSlice(Array):
+
+    def __init__(self, func: Array, start: Array, length: Array):
+        assert isinstance(func, Array) and func.ndim > 0, f'func={func!r}'
+        assert _isindex(start), f'start={start!r}'
+        assert _isindex(length), f'stop={stop!r}'
+        self.func = func
+        self.start = start
+        self.length = length
+        super().__init__(args=(func, start, length), shape=(*func.shape[:-1], length), dtype=func.dtype)
+
+    def evalf(self, func, start, length):
+        return func[...,start:start+length]
+
+    def _compile_expression(self, add_const, func, start, length):
+        stop = _pyast.BinOp(start, '+', length)
+        return func.get_item(_pyast.Tuple((_pyast.Raw('...'), _pyast.Raw('slice').call(start, stop))))
+
+    def _intbounds_impl(self):
+        return self.func._intbounds
+
+
 class Power(Array):
 
     def __init__(self, func: Array, power: Array):
@@ -3204,6 +3300,43 @@ def _intbounds_impl(self):
         return min(lower, 0), max(upper, 0)
 
 
+class _MultiInflate(Array):
+
+    def __init__(self, func: Array, dofmaps: typing.Tuple[Array, ...], lengths: typing.Tuple[Array, ...]):
+        assert isinstance(func, Array), f'func={func!r}'
+        assert isinstance(dofmaps, tuple) and all(isinstance(dofmap, Array) and dofmap.dtype == int for dofmap in dofmaps), f'dofmaps={dofmaps!r}'
+        assert isinstance(lengths, tuple) and all(map(_isindex, lengths)), f'lengths={lengths!r}'
+        assert len(dofmaps) == len(lengths)
+        self._ndofmaps = len(dofmaps)
+        self._npointwise = func.ndim - dofmaps[0].ndim
+        assert self._npointwise >= 0
+        assert all(dofmap.ndim == dofmaps[0].ndim and equalshape(dofmap.shape, func.shape[self._npointwise:]) for dofmap in dofmaps)
+        self.func = func
+        self.dofmaps = dofmaps
+        self.lengths = lengths
+        self._indices_head = (slice(None),) * self._npointwise
+        super().__init__(args=(func, *dofmaps, *lengths), shape=func.shape[:self._npointwise] + lengths, dtype=func.dtype)
+
+    def evalf(self, array, *args):
+        inflated = numpy.zeros(array.shape[:self._npointwise] + tuple(args[self._ndofmaps:]), dtype=self.dtype)
+        numpy.add.at(inflated, self._indices_head + tuple(args[:self._ndofmaps]), array)
+        return inflated
+
+    @cached_property
+    def _can_compile_iadd(self):
+        return True
+
+    def _compile_iadd(self, builder, out, zeroed_block_id):
+        indices = (_pyast.Raw('slice(None)'),) * self._npointwise
+        indices += tuple(builder.compile(self.dofmaps))
+        values = builder.compile(self.func)
+        builder.get_block_for_evaluable(self).array_add_at(out, _pyast.Tuple(indices), values)
+
+    def _intbounds_impl(self):
+        lower, upper = self.func._intbounds
+        return min(lower, 0), max(upper, 0)
+
+
 class SwapInflateTake(Evaluable):
 
     def __init__(self, inflateidx, takeidx):
@@ -3397,6 +3530,9 @@ def _simplified(self):
         if self.isconstant:
             return constant(self.eval())
 
+    def _intbounds_impl(self):
+        return 0, self.shape[0]._intbounds[1]
+
 
 class DerivativeTargetBase(Array):
     'base class for derivative targets'
@@ -3793,6 +3929,10 @@ def _intbounds_impl(self):
         assert lower >= 0
         return 0, max(0, upper - 1)
 
+    @cached_property
+    def _as_range_with_offset(self):
+        return self.length, zeros((), dtype=int)
+
 
 class InRange(Array):
 
@@ -3905,7 +4045,7 @@ def __init__(self, ncoeffs: Array, nvars: int) -> None:
         super().__init__(args=(ncoeffs,), shape=(), dtype=int)
 
     def evalf(self, ncoeffs):
-        return numpy.array(poly.degree(self.nvars, ncoeffs.__index__()))
+        return numpy.int_(poly.degree(self.nvars, ncoeffs.__index__()))
 
     def _compile_expression(self, add_constant, ncoeffs):
         ncoeffs = ncoeffs.get_attr('__index__').call()
@@ -3949,7 +4089,7 @@ def __init__(self, nvars: int, degree: Array) -> None:
         super().__init__(args=(degree,), shape=(), dtype=int)
 
     def evalf(self, degree):
-        return numpy.array(poly.ncoeffs(self.nvars, degree.__index__()))
+        return numpy.int_(poly.ncoeffs(self.nvars, degree.__index__()))
 
     def _compile_expression(self, add_constant, degree):
         degree = degree.get_attr('__index__').call()
@@ -4733,12 +4873,12 @@ def _add(self, other):
         if isinstance(other, LoopSum) and other.index == self.index:
             return loop_sum(self.func + other.func, self.index)
 
-    def _multiply(self, other):
-        # If `other` depends on `self.index`, e.g. because `self` is the inner
-        # loop of two nested `LoopSum`s over the same index, then we should not
-        # move `other` inside this loop.
-        if self.index not in other.arguments:
-            return loop_sum(self.func * other, self.index)
+    #def _multiply(self, other):
+    #    # If `other` depends on `self.index`, e.g. because `self` is the inner
+    #    # loop of two nested `LoopSum`s over the same index, then we should not
+    #    # move `other` inside this loop.
+    #    if self.index not in other.arguments:
+    #        return loop_sum(self.func * other, self.index)
 
     @cached_property
     def _assparse(self):
@@ -5667,6 +5807,7 @@ def compile(func, *, simplify: bool = True, _plot_stats: typing.Optional[bool] =
     # Simplify and optimize `funcs`.
     if simplify:
         funcs = [func.simplified for func in funcs]
+    funcs = [func._merge_inflates() for func in funcs]
     funcs = [func._optimized_for_numpy1 for func in funcs]
     funcs = _define_loop_block_structure(tuple(funcs))
 
diff --git a/nutils/function.py b/nutils/function.py
index 1e46dd35f..bf9ccad9f 100644
--- a/nutils/function.py
+++ b/nutils/function.py
@@ -2457,7 +2457,7 @@ def __init__(self, ndofs: int, nelems: int, index: Array, coords: Array) -> None
         arguments = _join_arguments((index.arguments, coords.arguments))
         super().__init__((ndofs,), float, spaces=index.spaces | coords.spaces, arguments=arguments)
 
-        _index = evaluable.Argument('_index', shape=(), dtype=int)
+        _index = evaluable.InRange(evaluable.Argument('_index', shape=(), dtype=int), evaluable.constant(self.nelems))
         self._arg_dofs_evaluable, self._arg_coeffs_evaluable = self.f_dofs_coeffs(_index)
         self._arg_ndofs_evaluable = evaluable.asarray(self._arg_dofs_evaluable.shape[0])
         assert self._arg_dofs_evaluable.ndim == 1
diff --git a/nutils/topology.py b/nutils/topology.py
index 644318f40..8c686efd2 100644
--- a/nutils/topology.py
+++ b/nutils/topology.py
@@ -2349,7 +2349,8 @@ def _asaffine(self, geom, arguments):
         geom_ = self.sample('uniform', n).eval(geom, **arguments) \
             .reshape(*self.shape, *[n] * self.ndims, self.ndims) \
             .transpose(*(i+j for i in range(self.ndims) for j in (0, self.ndims)), self.ndims*2) \
-            .reshape(*sampleshape, self.ndims)
+            .reshape(*sampleshape, self.ndims) \
+            .copy()
         # strategy: fit an affine plane through the minima and maxima of a uniform sample,
         # and evaluate the error as the largest difference on the remaining sample points
         xmin, xmax = geom_.reshape(-1, self.ndims)[[0, -1]]
diff --git a/tests/test_topology.py b/tests/test_topology.py
index 4ebcbf59e..d18f5ca61 100644
--- a/tests/test_topology.py
+++ b/tests/test_topology.py
@@ -703,7 +703,7 @@ def assertInterfaces(self, domain, geom, periodic, interfaces=None, elemindicato
             elemindicator = domain.basis('discont', degree=0)
         elemindicator = elemindicator.vector(domain.ndims)
         lhs = domain.integrate((elemindicator*f.grad(geom)[None]).sum(axis=1)*function.J(geom), ischeme='gauss2')
-        rhs = interfaces.integrate((-function.jump(elemindicator)*f*function.normal(geom)[None]).sum(axis=1)*function.J(geom), ischeme='gauss2')
+        rhs = interfaces.integrate((-function.jump(elemindicator)*f*function.normal(geom)[None]).sum(axis=1)*function.J(geom), ischeme='gauss2').copy()
         if len(domain.boundary):
             rhs += domain.boundary.integrate((elemindicator*f*function.normal(geom)[None]).sum(axis=1)*function.J(geom), ischeme='gauss2')
         numpy.testing.assert_array_almost_equal(lhs, rhs)