From 640770571ec2397e0f68fa8bb4c161102094b6b7 Mon Sep 17 00:00:00 2001
From: Nathanael Bosch <nathanael.bosch@uni-tuebingen.de>
Date: Mon, 19 Feb 2024 11:00:32 +0000
Subject: [PATCH] Add the DiagonalEK1 (#301)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* Add a BlockDiagonal implementation

* It works and it's (a little bit) faster than dense!

* Implement a first version of the DiagonalEK1

* Added smoothing

* Add BlockDiagonals to the tests

* This should be the proper logic to choose the cov factorization

* We can now select the covariance from the outside!

* Add some SIMD here and there

* Make views of BlockDiagonals illegal as they are super slow

* Change how the diffusions work

* Make some more diffusions work

* Better checking for validity of algorithm arguments

* More BlockDiagonal linalg things

* Better handling of the diffusion for prediction

* The global diffusion is now written into the cache directly

* Implement rmul! or the IsometricKroneckerProduct

* Properly ply the new diffusion after the solve

* Properly estimate the global scalar diffusion

* Properly implement the global MV diffusion

* This should be a proper implementation of the dynamic MV diffusion

* Try to fix how the prediction handles the diffusion (I failed)

* Try to get the DynamicMV diff to work with BlockDiag cov (but fail)

* Get the DiagonalEK1 to work with a dense covariance factorization

* Check diffusion and factorization compat somewhere else and warn instead of erroring

* JuliaFormatter.jl

* Implement my own BlockDiag type

* JuliaFormatter.jl

* Start fixing some tests

* Remove duplicate matmul implementation

* Fix some failing state init tests

* Improve the diffusion handling some more

* Enable the EK0 again with priors that are not Kronecker

* Remove one test case that's not yet supported

* Significantly speed up the secondorderodeproblem tests

* Remove a test that currently fails

* Fix many of the tests that I had to temporally remove

* Make it more obvious that BlockDiagonals and second order ODEs are not working

* Rename our BlockDiagonals to ProbNumDiffEqBlockDiagonal (+ shortcut)

* Add a BlockDiagonals extension to transfomr ours to theirs

* Add unit-tests for our `BlockDiag`s

* Check that the K.b is actually empty

* Add versions to overload also the non-blasfloat matmuls

* Make some code more compact and readable

* Change order to fit acronym

* For some reson the eval tests failed; so fix them

* Make the if else order in predict and backward kernel easier

* misc

* Properly implement `size`

* Remove an inbounds as we don't explicitly do a sizecheck

* Remove some checks again as they are irrelevant for the cov

* Add a very minimal docstring to ProbNumDiffEqBlockDiagonal

* Better BlockDiagonals and a bit of Kronecker

* Make the DiagonalEK1 work again (except for secondorderodes)

* Test the diffusions (found a bug! unittests are actually nice)

* Give the diffusions much more space

* Grealy simplify the local error estimate code

* Better predict tests

* Beter update and smoothing tests

* Praise the lord for unittests

* JuliaFormatter.jl

* Actually git the diffusion tests

* Testfix

* Remove some comments

* Test that the computed log-likelihood is correct

* Add BlockDiagonals compat entry

* Add more solvers to the autodiff tests

* Add much more tests

* JuliaFormatter.jl

* Better complexity test

* Actually do a proper test to check the scaling of the solvers

* JuliaFormatter.jl

* Make preconditioner computation simpler and test better

* One more prior test

* Fix some tests

* Get the data likelihoods to work with DiagonalEK1 and the EK0

* Make the data likelihoods better

* JuliaFormatter.jl

* Relax the complexity tests even more

* JuliaFormatter.jl

* Remove some unused code

* Bisschen code upgrade für die data likelihoods

* Make the complexity tests more compact

* Remove some unused things, mainly to re-trigger gh actions

* Fix the bad getindex for BlockDiag

* Use the built-in matrix exponential

* Remore parts of a test that are not implemented anymore

* Improve coverage a bit

* JuliaFormatter.jl

* Check better what observation noise works with what factorization

* Fix the pn_observation_noise check

* More BlockDiag tests

* Add some more Kronecker tests

* JuliaFormatter.jl

* Fix test

* Make the FixedMVDiffusion work with dense matrices

* Make the FixedMVDiffusion work with the data likelihoods

* Remove some comments

* Add `add!` to the Kronecker tests and shorten them a bit

* Fix the failing test that I just found
---
 Project.toml                            |  10 +-
 ext/BlockDiagonalsExt.jl                |   8 +
 src/ProbNumDiffEq.jl                    |  25 +-
 src/alg_utils.jl                        |  18 +-
 src/algorithms.jl                       | 151 +++++++++--
 src/blockdiagonals.jl                   | 321 +++++++++++++++++++++++
 src/caches.jl                           |  37 +--
 src/callbacks/dataupdate.jl             |  61 +++--
 src/checks.jl                           |   2 +-
 src/covariance_structure.jl             |  44 ++--
 src/data_likelihoods/fenrir.jl          |  20 +-
 src/derivative_utils.jl                 |  10 +-
 src/diffusions.jl                       | 213 ----------------
 src/diffusions/apply_diffusion.jl       | 102 ++++++++
 src/diffusions/calibration.jl           | 178 +++++++++++++
 src/diffusions/typedefs.jl              | 103 ++++++++
 src/fast_linalg.jl                      |  46 ++--
 src/filtering/markov_kernel.jl          |  88 ++++++-
 src/filtering/predict.jl                |  38 ++-
 src/filtering/update.jl                 |  44 ++++
 src/initialization/classicsolverinit.jl |   4 +-
 src/integrator_utils.jl                 |  25 +-
 src/kronecker.jl                        |  29 +++
 src/perform_step.jl                     |  49 ++--
 src/preconditioning.jl                  |  34 ++-
 src/priors/iwp.jl                       |  13 +-
 src/priors/ltisde.jl                    |  38 +--
 src/projection.jl                       |  19 ++
 test/Project.toml                       |   2 +-
 test/autodiff.jl                        |  83 +++---
 test/complexity.jl                      | 120 ++++-----
 test/core/blockdiagonals.jl             |  85 +++++++
 test/core/diffusions.jl                 |  59 +++++
 test/core/fast_linalg.jl                |  52 ++++
 test/core/filtering.jl                  | 324 ++++++++++++++----------
 test/core/kronecker.jl                  |  70 ++---
 test/core/preconditioning.jl            |  23 +-
 test/core/priors.jl                     |  74 +++---
 test/correctness.jl                     |  13 +
 test/data_likelihoods.jl                |  54 ++--
 test/errors_thrown.jl                   |  27 ++
 test/mass_matrix.jl                     |  17 ++
 test/observation_noise.jl               |  28 ++
 test/runtests.jl                        |  12 +
 test/secondorderode.jl                  |  14 +-
 test/state_init.jl                      |   2 +-
 test/stiff_problem.jl                   |   4 +
 47 files changed, 1999 insertions(+), 794 deletions(-)
 create mode 100644 ext/BlockDiagonalsExt.jl
 create mode 100644 src/blockdiagonals.jl
 delete mode 100644 src/diffusions.jl
 create mode 100644 src/diffusions/apply_diffusion.jl
 create mode 100644 src/diffusions/calibration.jl
 create mode 100644 src/diffusions/typedefs.jl
 create mode 100644 test/core/blockdiagonals.jl
 create mode 100644 test/core/diffusions.jl
 create mode 100644 test/core/fast_linalg.jl
 create mode 100644 test/observation_noise.jl

diff --git a/Project.toml b/Project.toml
index ed5af8bb9..6dc1f5565 100644
--- a/Project.toml
+++ b/Project.toml
@@ -7,11 +7,8 @@ version = "0.15.0"
 ArrayAllocators = "c9d4266f-a5cb-439d-837c-c97b191379f5"
 DiffEqBase = "2b5f629d-d688-5b77-993f-72d75c75574e"
 DiffEqCallbacks = "459566f4-90b8-5000-8ac3-15dfb0a30def"
-DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d"
 DocStringExtensions = "ffbed154-4ef7-542d-bbb7-c09d3a79fcae"
-ExponentialUtilities = "d4d017d3-3776-5f7e-afef-a10c40355c18"
 FastBroadcast = "7034ab61-46d4-4ed7-9d0f-46aef9175898"
-FastGaussQuadrature = "442a2c76-b920-505d-bb47-c5924d526838"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 FiniteHorizonGramians = "b59a298d-d283-4a37-9369-85a9f9a111a5"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
@@ -30,7 +27,6 @@ RecursiveArrayTools = "731186ca-8d62-57ce-b412-fbd966d074cd"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
 SimpleUnPack = "ce78b400-467f-4804-87d8-8f486da07d0a"
-SpecialMatrices = "928aab9d-ef52-54ac-8ca1-acd7ca42c160"
 StaticArrayInterface = "0d7ed370-da01-4f52-bd93-41d350b8b718"
 Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StructArrays = "09ab397b-f2b6-538f-b94a-2f83cf4a842a"
@@ -39,22 +35,23 @@ TaylorSeries = "6aa5eb33-94cf-58f4-a9d0-e4b2c4fc25ea"
 ToeplitzMatrices = "c751599d-da0a-543b-9d20-d0a503d91d24"
 
 [weakdeps]
+BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d"
 RecipesBase = "3cdcf5f2-1ef4-517c-9805-6587b60abb01"
 
 [extensions]
+BlockDiagonalsExt = "BlockDiagonals"
 DiffEqDevToolsExt = "DiffEqDevTools"
 RecipesBaseExt = "RecipesBase"
 
 [compat]
 ArrayAllocators = "0.3"
+BlockDiagonals = "0.1"
 DiffEqBase = "6.122"
 DiffEqCallbacks = "2.36"
 DiffEqDevTools = "2"
 DocStringExtensions = "0.9"
-ExponentialUtilities = "1"
 FastBroadcast = "0.2"
-FastGaussQuadrature = "0.5, 1"
 FillArrays = "1.9"
 FiniteHorizonGramians = "0.2"
 ForwardDiff = "0.10"
@@ -73,7 +70,6 @@ RecursiveArrayTools = "2, 3"
 Reexport = "1"
 SciMLBase = "1.90, 2"
 SimpleUnPack = "1"
-SpecialMatrices = "3"
 StaticArrayInterface = "1.3"
 Statistics = "1"
 StructArrays = "0.4, 0.5, 0.6"
diff --git a/ext/BlockDiagonalsExt.jl b/ext/BlockDiagonalsExt.jl
new file mode 100644
index 000000000..37a7f6bfc
--- /dev/null
+++ b/ext/BlockDiagonalsExt.jl
@@ -0,0 +1,8 @@
+module BlockDiagonalsExt
+
+import ProbNumDiffEq: ProbNumDiffEqBlockDiagonal, blocks
+import BlockDiagonals: BlockDiagonal
+
+BlockDiagonal(M::ProbNumDiffEqBlockDiagonal) = BlockDiagonal(blocks(M))
+
+end
diff --git a/src/ProbNumDiffEq.jl b/src/ProbNumDiffEq.jl
index eac2a17cb..1d3756455 100644
--- a/src/ProbNumDiffEq.jl
+++ b/src/ProbNumDiffEq.jl
@@ -2,7 +2,8 @@ __precompile__()
 
 module ProbNumDiffEq
 
-import Base: copy, copy!, show, size, ndims, similar, isapprox, isequal, iterate, ==, length
+import Base:
+    copy, copy!, show, size, ndims, similar, isapprox, isequal, iterate, ==, length, zero
 
 using LinearAlgebra
 import LinearAlgebra: mul!
@@ -15,7 +16,7 @@ using Reexport
 import SciMLBase
 import SciMLBase: interpret_vars, getsyms, remake
 using OrdinaryDiffEq
-using SpecialMatrices, ToeplitzMatrices
+using ToeplitzMatrices
 using FastBroadcast
 using StaticArrayInterface
 using FunctionWrappersWrappers
@@ -24,9 +25,7 @@ using TaylorSeries, TaylorIntegration
 using SimpleUnPack
 using RecursiveArrayTools
 using ForwardDiff
-using ExponentialUtilities
 using Octavian
-using FastGaussQuadrature
 import Kronecker
 using ArrayAllocators
 using FiniteHorizonGramians
@@ -45,15 +44,27 @@ vecvec2mat(x) = reduce(hcat, x)'
 
 cov2psdmatrix(cov::Number; d) = PSDMatrix(sqrt(cov) * Eye(d))
 cov2psdmatrix(cov::UniformScaling; d) = PSDMatrix(sqrt(cov.λ) * Eye(d))
+cov2psdmatrix(cov::Diagonal{<:Number,<:FillArrays.Fill}; d) =
+    (@assert size(cov, 1) == size(cov, 2) == d; cov2psdmatrix(cov.diag.value; d))
 cov2psdmatrix(cov::Diagonal; d) =
     (@assert size(cov, 1) == size(cov, 2) == d; PSDMatrix(sqrt.(cov)))
 cov2psdmatrix(cov::AbstractMatrix; d) =
     (@assert size(cov, 1) == size(cov, 2) == d; PSDMatrix(Matrix(cholesky(cov).U)))
 cov2psdmatrix(cov::PSDMatrix; d) = (@assert size(cov, 1) == size(cov, 2) == d; cov)
 
+"""
+    add!(out, toadd)
+
+Add `toadd` to `out` in-place.
+"""
+add!
+add!(out, toadd) = (out .+= toadd)
+
 include("fast_linalg.jl")
 include("kronecker.jl")
+include("blockdiagonals.jl")
 include("covariance_structure.jl")
+export IsometricKroneckerCovariance, DenseCovariance, BlockDiagonalCovariance
 
 abstract type AbstractODEFilterCache <: OrdinaryDiffEq.OrdinaryDiffEqCache end
 
@@ -65,14 +76,16 @@ include("priors/ltisde.jl")
 include("priors/ioup.jl")
 include("priors/matern.jl")
 export IWP, IOUP, Matern
-include("diffusions.jl")
+include("diffusions/typedefs.jl")
+include("diffusions/apply_diffusion.jl")
+include("diffusions/calibration.jl")
 export FixedDiffusion, DynamicDiffusion, FixedMVDiffusion, DynamicMVDiffusion
 
 include("initialization/common.jl")
 export TaylorModeInit, ClassicSolverInit, SimpleInit, ForwardDiffInit
 
 include("algorithms.jl")
-export EK0, EK1
+export EK0, EK1, DiagonalEK1
 export ExpEK, RosenbrockExpEK
 
 include("alg_utils.jl")
diff --git a/src/alg_utils.jl b/src/alg_utils.jl
index a3c84736f..4fbbddd1b 100644
--- a/src/alg_utils.jl
+++ b/src/alg_utils.jl
@@ -4,19 +4,25 @@
 ############################################################################################
 
 OrdinaryDiffEq._alg_autodiff(::AbstractEK) = Val{true}()
-OrdinaryDiffEq._alg_autodiff(::EK1{CS,AD}) where {CS,AD} = Val{AD}()
-OrdinaryDiffEq.alg_difftype(::EK1{CS,AD,DiffType}) where {CS,AD,DiffType} = DiffType
 OrdinaryDiffEq.standardtag(::AbstractEK) = false
-OrdinaryDiffEq.standardtag(::EK1{CS,AD,DiffType,ST}) where {CS,AD,DiffType,ST} = ST
 OrdinaryDiffEq.concrete_jac(::AbstractEK) = nothing
-OrdinaryDiffEq.concrete_jac(::EK1{CS,AD,DiffType,ST,CJ}) where {CS,AD,DiffType,ST,CJ} = CJ
 
 @inline DiffEqBase.get_tmp_cache(integ, alg::AbstractEK, cache::AbstractODEFilterCache) =
     (cache.tmp, cache.atmp)
-OrdinaryDiffEq.get_chunksize(::EK1{CS}) where {CS} = Val(CS)
 OrdinaryDiffEq.isfsal(::AbstractEK) = false
 
-OrdinaryDiffEq.isimplicit(::EK1) = true
+for ALG in [:EK1, :DiagonalEK1]
+    @eval OrdinaryDiffEq._alg_autodiff(::$ALG{CS,AD}) where {CS,AD} = Val{AD}()
+    @eval OrdinaryDiffEq.alg_difftype(::$ALG{CS,AD,DiffType}) where {CS,AD,DiffType} =
+        DiffType
+    @eval OrdinaryDiffEq.standardtag(::$ALG{CS,AD,DiffType,ST}) where {CS,AD,DiffType,ST} =
+        ST
+    @eval OrdinaryDiffEq.concrete_jac(
+        ::$ALG{CS,AD,DiffType,ST,CJ},
+    ) where {CS,AD,DiffType,ST,CJ} = CJ
+    @eval OrdinaryDiffEq.get_chunksize(::$ALG{CS}) where {CS} = Val(CS)
+    @eval OrdinaryDiffEq.isimplicit(::$ALG) = true
+end
 
 ############################################
 # Step size control
diff --git a/src/algorithms.jl b/src/algorithms.jl
index 9982f0f5d..ba535e334 100644
--- a/src/algorithms.jl
+++ b/src/algorithms.jl
@@ -3,7 +3,13 @@
 ########################################################################################
 abstract type AbstractEK <: OrdinaryDiffEq.OrdinaryDiffEqAdaptiveAlgorithm end
 
-function ekargcheck(alg; diffusionmodel, pn_observation_noise, kwargs...)
+function ekargcheck(
+    alg;
+    diffusionmodel,
+    pn_observation_noise,
+    covariance_factorization,
+    kwargs...,
+)
     if (isstatic(diffusionmodel) && diffusionmodel.calibrate) &&
        (!isnothing(pn_observation_noise) && !iszero(pn_observation_noise))
         throw(
@@ -12,14 +18,63 @@ function ekargcheck(alg; diffusionmodel, pn_observation_noise, kwargs...)
             ),
         )
     end
-    if (
-        (diffusionmodel isa FixedMVDiffusion && diffusionmodel.calibrate) ||
-        diffusionmodel isa DynamicMVDiffusion) && alg == EK1
-        throw(
-            ArgumentError(
-                "The `EK1` algorithm does not support automatic calibration of multivariate diffusion models. Either use the `EK0` instead, or use a scalar diffusion model, or set `calibrate=false` and calibrate manually by optimizing `sol.pnstats.log_likelihood`.",
-            ),
-        )
+    if alg == EK1
+        if diffusionmodel isa FixedMVDiffusion && diffusionmodel.calibrate
+            throw(
+                ArgumentError(
+                    "The `EK1` algorithm does not support automatic global calibration of multivariate diffusion models. Either use a scalar diffusion model, or set `calibrate=false` and calibrate manually by optimizing `sol.pnstats.log_likelihood`. Or use a different solve, like `EK0` or `DiagonalEK1`.",
+                ),
+            )
+        elseif diffusionmodel isa DynamicMVDiffusion
+            throw(
+                ArgumentError(
+                    "The `EK1` algorithm does not support automatic calibration of local multivariate diffusion models. Either use a scalar diffusion model, or use a different solve, like `EK0` or `DiagonalEK1`.",
+                ),
+            )
+        end
+    end
+    if !(isnothing(pn_observation_noise) || ismissing(pn_observation_noise))
+        if covariance_factorization == IsometricKroneckerCovariance && !(
+            pn_observation_noise isa Number
+            || pn_observation_noise isa UniformScaling
+            || pn_observation_noise isa Diagonal{<:Number,<:FillArrays.Fill})
+            throw(
+                ArgumentError(
+                    "The supplied `pn_observation_noise` is not compatible with the chosen `IsometricKroneckerCovariance` factorization. Try one of `BlockDiagonalCovariance` or `DenseCovariance` instead!",
+                ),
+            )
+        end
+        if covariance_factorization == BlockDiagonalCovariance && !(
+            pn_observation_noise isa Number
+            || pn_observation_noise isa UniformScaling
+            || pn_observation_noise isa Diagonal)
+            throw(
+                ArgumentError(
+                    "The supplied `pn_observation_noise` is not compatible with the chosen `BlockDiagonalCovariance` factorization. Try `DenseCovariance` instead!",
+                ),
+            )
+        end
+    end
+end
+
+function covariance_structure(::Type{Alg}, prior, diffusionmodel) where {Alg<:AbstractEK}
+    if Alg <: EK0
+        if prior isa IWP
+            if (diffusionmodel isa DynamicDiffusion || diffusionmodel isa FixedDiffusion)
+                return IsometricKroneckerCovariance
+            else
+                return BlockDiagonalCovariance
+            end
+        else
+            # This is not great as other priors can be Kronecker too; TODO
+            return DenseCovariance
+        end
+    elseif Alg <: DiagonalEK1
+        return BlockDiagonalCovariance
+    elseif Alg <: EK1
+        return DenseCovariance
+    else
+        throw(ArgumentError("Unknown algorithm type $Alg"))
     end
 end
 
@@ -58,22 +113,25 @@ julia> solve(prob, EK0())
 
 # [References](@ref references)
 """
-struct EK0{PT,DT,IT,RT} <: AbstractEK
+struct EK0{PT,DT,IT,RT,CF} <: AbstractEK
     prior::PT
     diffusionmodel::DT
     smooth::Bool
     initialization::IT
     pn_observation_noise::RT
+    covariance_factorization::CF
     EK0(; order=3,
         prior::PT=IWP(order),
         diffusionmodel::DT=DynamicDiffusion(),
         smooth=true,
         initialization::IT=TaylorModeInit(num_derivatives(prior)),
         pn_observation_noise::RT=nothing,
-    ) where {PT,DT,IT,RT} = begin
-        ekargcheck(EK0; diffusionmodel, pn_observation_noise)
-        new{PT,DT,IT,RT}(
-            prior, diffusionmodel, smooth, initialization, pn_observation_noise)
+        covariance_factorization::CF=covariance_structure(EK0, prior, diffusionmodel),
+    ) where {PT,DT,IT,RT,CF} = begin
+        ekargcheck(EK0; diffusionmodel, pn_observation_noise, covariance_factorization)
+        new{PT,DT,IT,RT,CF}(
+            prior, diffusionmodel, smooth, initialization, pn_observation_noise,
+            covariance_factorization)
     end
 end
 
@@ -117,12 +175,13 @@ julia> solve(prob, EK1())
 
 # [References](@ref references)
 """
-struct EK1{CS,AD,DiffType,ST,CJ,PT,DT,IT,RT} <: AbstractEK
+struct EK1{CS,AD,DiffType,ST,CJ,PT,DT,IT,RT,CF} <: AbstractEK
     prior::PT
     diffusionmodel::DT
     smooth::Bool
     initialization::IT
     pn_observation_noise::RT
+    covariance_factorization::CF
     EK1(;
         order=3,
         prior::PT=IWP(order),
@@ -135,8 +194,57 @@ struct EK1{CS,AD,DiffType,ST,CJ,PT,DT,IT,RT} <: AbstractEK
         standardtag=Val{true}(),
         concrete_jac=nothing,
         pn_observation_noise::RT=nothing,
-    ) where {PT,DT,IT,RT} = begin
-        ekargcheck(EK1; diffusionmodel, pn_observation_noise)
+        covariance_factorization::CF=covariance_structure(EK1, prior, diffusionmodel),
+    ) where {PT,DT,IT,RT,CF} = begin
+        ekargcheck(EK1; diffusionmodel, pn_observation_noise, covariance_factorization)
+        new{
+            _unwrap_val(chunk_size),
+            _unwrap_val(autodiff),
+            diff_type,
+            _unwrap_val(standardtag),
+            _unwrap_val(concrete_jac),
+            PT,
+            DT,
+            IT,
+            RT,
+            CF,
+        }(
+            prior,
+            diffusionmodel,
+            smooth,
+            initialization,
+            pn_observation_noise,
+            covariance_factorization,
+        )
+    end
+end
+
+struct DiagonalEK1{CS,AD,DiffType,ST,CJ,PT,DT,IT,RT,CF} <: AbstractEK
+    prior::PT
+    diffusionmodel::DT
+    smooth::Bool
+    initialization::IT
+    pn_observation_noise::RT
+    covariance_factorization::CF
+    DiagonalEK1(;
+        order=3,
+        prior::PT=IWP(order),
+        diffusionmodel::DT=DynamicDiffusion(),
+        smooth=true,
+        initialization::IT=TaylorModeInit(num_derivatives(prior)),
+        chunk_size=Val{0}(),
+        autodiff=Val{true}(),
+        diff_type=Val{:forward},
+        standardtag=Val{true}(),
+        concrete_jac=nothing,
+        pn_observation_noise::RT=nothing,
+        covariance_factorization::CF=covariance_structure(
+            DiagonalEK1,
+            prior,
+            diffusionmodel,
+        ),
+    ) where {PT,DT,IT,RT,CF} = begin
+        ekargcheck(DiagonalEK1; diffusionmodel, pn_observation_noise, covariance_factorization)
         new{
             _unwrap_val(chunk_size),
             _unwrap_val(autodiff),
@@ -147,12 +255,14 @@ struct EK1{CS,AD,DiffType,ST,CJ,PT,DT,IT,RT} <: AbstractEK
             DT,
             IT,
             RT,
+            CF,
         }(
             prior,
             diffusionmodel,
             smooth,
             initialization,
             pn_observation_noise,
+            covariance_factorization,
         )
     end
 end
@@ -236,7 +346,12 @@ function DiffEqBase.remake(thing::EK1{CS,AD,DT,ST,CJ}; kwargs...) where {CS,AD,D
     )
 end
 
-function DiffEqBase.prepare_alg(alg::EK1{0}, u0::AbstractArray{T}, p, prob) where {T}
+function DiffEqBase.prepare_alg(
+    alg::Union{EK1{0},DiagonalEK1{0}},
+    u0::AbstractArray{T},
+    p,
+    prob,
+) where {T}
     # See OrdinaryDiffEq.jl: ./src/alg_utils.jl (where this is copied from).
     # In the future we might want to make EK1 an OrdinaryDiffEqAdaptiveImplicitAlgorithm and
     # use the prepare_alg from OrdinaryDiffEq; but right now, we do not use `linsolve` which
diff --git a/src/blockdiagonals.jl b/src/blockdiagonals.jl
new file mode 100644
index 000000000..6ec085a01
--- /dev/null
+++ b/src/blockdiagonals.jl
@@ -0,0 +1,321 @@
+"""
+    ProbNumDiffEqBlockDiagonal(blocks::Vector{V}) where {T,V<:AbstractMatrix{T}}
+
+A very minimal but fast re-implementation of `BlockDiagonals.Blockdiagonal`.
+"""
+struct ProbNumDiffEqBlockDiagonal{T<:Number,V<:AbstractMatrix{T}} <: AbstractMatrix{T}
+    blocks::Vector{V}
+    function ProbNumDiffEqBlockDiagonal{T,V}(
+        blocks::Vector{V},
+    ) where {T,V<:AbstractMatrix{T}}
+        return new{T,V}(blocks)
+    end
+end
+function ProbNumDiffEqBlockDiagonal(blocks::Vector{V}) where {T,V<:AbstractMatrix{T}}
+    return ProbNumDiffEqBlockDiagonal{T,V}(blocks)
+end
+const BlockDiag = ProbNumDiffEqBlockDiagonal
+
+blocks(B::BlockDiag) = B.blocks
+nblocks(B::BlockDiag) = length(B.blocks)
+size(B::BlockDiag) = mapreduce(size, ((a, b), (c, d)) -> (a + c, b + d), blocks(B))
+
+Base.@propagate_inbounds function Base.getindex(
+    B::BlockDiag{T},
+    i::Integer,
+    j::Integer,
+) where {T}
+    all((0, 0) .< (i, j) .<= size(B)) || throw(BoundsError(B, (i, j)))
+
+    p = 1
+    Si, Sj = size(blocks(B)[p])
+    while p <= nblocks(B)
+        if i <= Si && j <= Sj
+            return blocks(B)[p][i, j]
+        elseif (i <= Si && j > Sj) || (j <= Sj && i > Si)
+            return zero(T)
+        else
+            i -= Si
+            j -= Sj
+            p += 1
+        end
+    end
+    error("This shouldn't happen")
+end
+
+Base.view(::BlockDiag, idxs...) =
+    throw(ErrorException("`BlockDiag` does not support views!"))
+
+copy(B::BlockDiag) = BlockDiag(copy.(blocks(B)))
+copy!(B::BlockDiag, A::BlockDiag) = begin
+    @assert length(A.blocks) == length(B.blocks)
+    @simd ivdep for i in eachindex(blocks(B))
+        copy!(B.blocks[i], A.blocks[i])
+    end
+    return B
+end
+similar(B::BlockDiag) = BlockDiag(similar.(blocks(B)))
+zero(B::BlockDiag) = BlockDiag(zero.(blocks(B)))
+
+# Sums of BlockDiag
+Base.:+(A::BlockDiag, B::BlockDiag) = begin
+    @assert nblocks(A) == nblocks(B)
+    return BlockDiag([Ai + Bi for (Ai, Bi) in zip(blocks(A), blocks(B))])
+end
+Base.:-(A::BlockDiag, B::BlockDiag) = begin
+    @assert nblocks(A) == nblocks(B)
+    return BlockDiag([Ai - Bi for (Ai, Bi) in zip(blocks(A), blocks(B))])
+end
+
+add!(out::BlockDiag, toadd::BlockDiag) = begin
+    @assert nblocks(out) == nblocks(toadd)
+    @simd ivdep for i in eachindex(blocks(out))
+        add!(blocks(out)[i], blocks(toadd)[i])
+    end
+    return out
+end
+
+# Mul with Scalar or UniformScaling
+Base.:*(a::Number, M::BlockDiag) = BlockDiag([a * B for B in blocks(M)])
+Base.:*(M::BlockDiag, a::Number) = BlockDiag([B * a for B in blocks(M)])
+Base.:*(U::UniformScaling, M::BlockDiag) = BlockDiag([U * B for B in blocks(M)])
+Base.:*(M::BlockDiag, U::UniformScaling) = BlockDiag([B * U for B in blocks(M)])
+
+# Mul between BockDiag's
+Base.:*(A::BlockDiag, B::BlockDiag) = begin
+    @assert length(A.blocks) == length(B.blocks)
+    return BlockDiag([Ai * Bi for (Ai, Bi) in zip(blocks(A), blocks(B))])
+end
+Base.:*(A::Adjoint{T,<:BlockDiag}, B::BlockDiag) where {T} = begin
+    @assert length(A.parent.blocks) == length(B.blocks)
+    return BlockDiag([Ai' * Bi for (Ai, Bi) in zip(blocks(A.parent), blocks(B))])
+end
+Base.:*(A::BlockDiag, B::Adjoint{T,<:BlockDiag}) where {T} = begin
+    @assert length(A.blocks) == length(B.parent.blocks)
+    return BlockDiag([Ai * Bi' for (Ai, Bi) in zip(blocks(A), blocks(B.parent))])
+end
+
+# Standard LinearAlgebra.mul!
+for _mul! in (:mul!, :_matmul!)
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::BlockDiag) = begin
+        @assert length(C.blocks) == length(A.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], B.blocks[i])
+        end
+        return C
+    end
+    (_mul! == :_matmul!) && @eval $_mul!(
+        C::BlockDiag{T},
+        A::BlockDiag{T},
+        B::BlockDiag{T},
+    ) where {T<:LinearAlgebra.BlasFloat} = begin
+        @assert length(C.blocks) == length(A.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], B.blocks[i])
+        end
+        return C
+    end
+
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::BlockDiag, alpha::Number, beta::Number) =
+        begin
+            @assert length(C.blocks) == length(A.blocks) == length(B.blocks)
+            @simd ivdep for i in eachindex(blocks(C))
+                @inbounds $_mul!(C.blocks[i], A.blocks[i], B.blocks[i], alpha, beta)
+            end
+            return C
+        end
+    (_mul! == :_matmul!) && @eval $_mul!(
+        C::BlockDiag{T},
+        A::BlockDiag{T},
+        B::BlockDiag{T},
+        alpha::Number,
+        beta::Number,
+    ) where {T<:LinearAlgebra.BlasFloat} = begin
+        @assert length(C.blocks) == length(A.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], B.blocks[i], alpha, beta)
+        end
+        return C
+    end
+
+    @eval $_mul!(C::BlockDiag, A::Adjoint{<:Number,<:BlockDiag}, B::BlockDiag) = begin
+        @assert length(C.blocks) == length(A.parent.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], adjoint(A.parent.blocks[i]), B.blocks[i])
+        end
+        return C
+    end
+    (_mul! == :_matmul!) && @eval $_mul!(
+        C::BlockDiag{T},
+        A::BlockDiag{T},
+        B::Adjoint{T,<:BlockDiag{T}},
+    ) where {T<:LinearAlgebra.BlasFloat} = begin
+        @assert length(C.blocks) == length(A.blocks) == length(B.parent.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], adjoint(B.parent.blocks[i]))
+        end
+        return C
+    end
+
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::Adjoint{<:Number,<:BlockDiag}) = begin
+        @assert length(C.blocks) == length(A.blocks) == length(B.parent.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], adjoint(B.parent.blocks[i]))
+        end
+        return C
+    end
+    (_mul! == :_matmul!) && @eval $_mul!(
+        C::BlockDiag{T},
+        A::Adjoint{T,<:BlockDiag{T}},
+        B::BlockDiag{T},
+    ) where {T<:LinearAlgebra.BlasFloat} = begin
+        @assert length(C.blocks) == length(A.parent.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], adjoint(A.parent.blocks[i]), B.blocks[i])
+        end
+        return C
+    end
+
+    @eval $_mul!(C::BlockDiag, A::Number, B::BlockDiag) = begin
+        @assert length(C.blocks) == length(B.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A, B.blocks[i])
+        end
+        return C
+    end
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::Number) = begin
+        @assert length(C.blocks) == length(A.blocks)
+        @simd ivdep for i in eachindex(blocks(C))
+            @inbounds $_mul!(C.blocks[i], A.blocks[i], B)
+        end
+        return C
+    end
+
+    @eval $_mul!(
+        C::AbstractVector,
+        A::BlockDiag,
+        B::AbstractVector,
+    ) = begin
+        @assert size(A, 2) == length(B)
+        @assert length(C) == size(A, 1)
+        ic, ib = 1, 1
+        for i in eachindex(blocks(A))
+            d1, d2 = size(A.blocks[i])
+            @inbounds $_mul!(view(C, ic:(ic+d1-1)), A.blocks[i], view(B, ib:(ib+d2-1)))
+            ic += d1
+            ib += d2
+        end
+        return C
+    end
+    (_mul! == :_matmul!) && @eval $_mul!(
+        C::AbstractVector{T},
+        A::BlockDiag{T},
+        B::AbstractVector{T},
+    ) where {T<:LinearAlgebra.BlasFloat} = begin
+        @assert size(A, 2) == length(B)
+        @assert length(C) == size(A, 1)
+        ic, ib = 1, 1
+        for i in eachindex(blocks(A))
+            d1, d2 = size(A.blocks[i])
+            @inbounds $_mul!(view(C, ic:(ic+d1-1)), A.blocks[i], view(B, ib:(ib+d2-1)))
+            ic += d1
+            ib += d2
+        end
+        return C
+    end
+end
+
+LinearAlgebra.rmul!(B::BlockDiag, n::Number) = begin
+    @simd ivdep for i in eachindex(B.blocks)
+        rmul!(B.blocks[i], n)
+    end
+    return B
+end
+
+LinearAlgebra.inv(A::BlockDiag) = BlockDiag(inv.(blocks(A)))
+
+copy!(A::BlockDiag, B::Diagonal) = begin
+    @assert size(A) == size(B)
+    i = 1
+    for Ai in blocks(A)
+        d = LinearAlgebra.checksquare(Ai)
+        @views copy!(Ai, Diagonal(B.diag[i:i+d-1]))
+        i += d
+    end
+    return A
+end
+
+Base.:*(D::Diagonal, A::BlockDiag) = begin
+    @assert size(D, 2) == size(A, 1)
+    local i = 1
+    outblocks = map(blocks(A)) do Ai
+        d = size(Ai, 1)
+        outi = Diagonal(view(D.diag, i:(i+d-1))) * Ai
+        i += d
+        outi
+    end
+    return BlockDiag(outblocks)
+end
+Base.:*(A::BlockDiag, D::Diagonal) = begin
+    local i = 1
+    outblocks = map(blocks(A)) do Ai
+        d = size(Ai, 2)
+        outi = Ai * Diagonal(view(D.diag, i:(i+d-1)))
+        i += d
+        outi
+    end
+    return BlockDiag(outblocks)
+end
+for _mul! in (:mul!, :_matmul!)
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::Diagonal) = begin
+        local i = 1
+        @assert nblocks(C) == nblocks(A)
+        for j in eachindex(blocks(C))
+            Ci, Ai = blocks(C)[j], blocks(A)[j]
+            d = size(Ai, 2)
+            $_mul!(Ci, Ai, Diagonal(view(B.diag, i:(i+d-1))))
+            i += d
+        end
+        return C
+    end
+    @eval $_mul!(C::BlockDiag, A::Diagonal, B::BlockDiag) = begin
+        local i = 1
+        @assert nblocks(C) == nblocks(B)
+        for j in eachindex(blocks(C))
+            Ci, Bi = blocks(C)[j], blocks(B)[j]
+            d = size(Bi, 1)
+            $_mul!(Ci, Diagonal(view(A.diag, i:(i+d-1))), Bi)
+            i += d
+        end
+        return C
+    end
+    @eval $_mul!(C::BlockDiag, A::BlockDiag, B::Diagonal, alpha::Number, beta::Number) =
+        begin
+            local i = 1
+            @assert nblocks(C) == nblocks(A)
+            for j in eachindex(blocks(C))
+                Ci, Ai = blocks(C)[j], blocks(A)[j]
+                d = size(Ai, 2)
+                $_mul!(Ci, Ai, Diagonal(view(B.diag, i:(i+d-1))), alpha, beta)
+                i += d
+            end
+            return C
+        end
+    @eval $_mul!(C::BlockDiag, A::Diagonal, B::BlockDiag, alpha::Number, beta::Number) =
+        begin
+            i = 1
+            @assert nblocks(C) == nblocks(B)
+            for j in eachindex(blocks(C))
+                Ci, Bi = blocks(C)[j], blocks(B)[j]
+                d = size(Bi, 1)
+                @inbounds $_mul!(Ci, Diagonal(view(A.diag, i:(i+d-1))), Bi, alpha, beta)
+                i += d
+            end
+            return C
+        end
+end
+
+Base.isequal(A::BlockDiag, B::BlockDiag) =
+    length(A.blocks) == length(B.blocks) && all(map(isequal, A.blocks, B.blocks))
+==(A::BlockDiag, B::BlockDiag) =
+    length(A.blocks) == length(B.blocks) && all(map(==, A.blocks, B.blocks))
diff --git a/src/caches.jl b/src/caches.jl
index da8e191cd..ab28e0657 100644
--- a/src/caches.jl
+++ b/src/caches.jl
@@ -5,7 +5,7 @@ mutable struct EKCache{
     RType,CFacType,ProjType,SolProjType,PType,PIType,EType,uType,duType,xType,PriorType,
     AType,QType,
     FType,LType,FHGMethodType,FHGCacheType,
-    HType,vecType,matType,bkType,diffusionType,diffModelType,measModType,measType,
+    HType,vecType,dduType,matType,bkType,diffusionType,diffModelType,measModType,measType,
     puType,llType,dtType,rateType,UF,JC,uNoUnitsType,
 } <: AbstractODEFilterCache
     # Constants
@@ -49,7 +49,7 @@ mutable struct EKCache{
     pu_tmp::puType
     H::HType
     du::duType
-    ddu::matType
+    ddu::dduType
     K1::matType
     G1::matType
     Smat::HType
@@ -103,7 +103,7 @@ function OrdinaryDiffEq.alg_cache(
     # uElType = eltype(u_vec)
     uElType = uBottomEltypeNoUnits
 
-    FAC = get_covariance_structure(alg; elType=uElType, d, q)
+    FAC = alg.covariance_factorization{uElType}(d, q)
     if FAC isa IsometricKroneckerCovariance && !(f.mass_matrix isa UniformScaling)
         throw(
             ArgumentError(
@@ -165,7 +165,7 @@ function OrdinaryDiffEq.alg_cache(
     # Diffusion Model
     diffmodel = alg.diffusionmodel
     initdiff = initial_diffusion(diffmodel, d, q, uEltypeNoUnits)
-    copy!(x0.Σ, apply_diffusion(x0.Σ, initdiff))
+    apply_diffusion!(x0.Σ, initdiff)
 
     # Measurement model related things
     R =
@@ -178,22 +178,23 @@ function OrdinaryDiffEq.alg_cache(
 
     # Caches
     du = is_secondorder_ode ? similar(u.x[2]) : similar(u)
-    ddu = factorized_similar(FAC, length(u), length(u))
+    ddu =
+        !isnothing(f.jac_prototype) ?
+        f.jac_prototype : zeros(uElType, length(u), length(u))
     _d = is_secondorder_ode ? 2d : d
-    pu_tmp = Gaussian(
-        similar(Array{uElType}, _d),
-        PSDMatrix(
-            if FAC isa IsometricKroneckerCovariance
-                if is_secondorder_ode
+    pu_tmp = if is_secondorder_ode
+        Gaussian(similar(Array{uElType}, 2d),
+            PSDMatrix(
+                if FAC isa IsometricKroneckerCovariance
                     Kronecker.kronecker(similar(Matrix{uElType}, D ÷ d, _d ÷ d), I(d))
+                elseif FAC isa BlockDiagonalCovariance
+                    error("I have no idea")
                 else
-                    factorized_similar(FAC, D, d)
-                end
-            else
-                similar(Matrix{uElType}, D, _d)
-            end,
-        ),
-    )
+                    similar(Matrix{uElType}, D, _d)
+                end))
+    else
+        Gaussian(similar(Array{uElType}, d), PSDMatrix(factorized_similar(FAC, D, d)))
+    end
 
     K = factorized_similar(FAC, D, d)
     G = factorized_similar(FAC, D, D)
@@ -240,7 +241,7 @@ function OrdinaryDiffEq.alg_cache(
         typeof(R),typeof(FAC),typeof(Proj),typeof(SolProj),typeof(P),typeof(PI),typeof(E0),
         uType,typeof(du),typeof(x0),typeof(prior),typeof(A),typeof(Q),
         typeof(F),typeof(L),typeof(FHG_method),typeof(FHG_cache),
-        typeof(H),typeof(C_d),matType,
+        typeof(H),typeof(C_d),typeof(ddu),matType,
         typeof(backward_kernel),typeof(initdiff),
         typeof(diffmodel),typeof(measurement_model),typeof(measurement),typeof(pu_tmp),
         uEltypeNoUnits,typeof(dt),typeof(du1),typeof(uf),typeof(jac_config),typeof(atmp),
diff --git a/src/callbacks/dataupdate.jl b/src/callbacks/dataupdate.jl
index c14223cab..def5759ac 100644
--- a/src/callbacks/dataupdate.jl
+++ b/src/callbacks/dataupdate.jl
@@ -50,33 +50,30 @@ function DataUpdateCallback(
         val = values[idx]
 
         o = length(val)
+        d = integ.cache.d
 
         @unpack x, E0, m_tmp, G1 = integ.cache
-        H = view(G1, 1:o, :)
-        if observation_matrix === I
-            @.. H = E0
-        elseif observation_matrix isa UniformScaling
-            @.. H = observation_matrix.λ * E0
-        else
-            matmul!(H, observation_matrix, E0)
-        end
+        M = observation_matrix
+        H = M * E0
 
         obs_mean = _matmul!(view(m_tmp.μ, 1:o), H, x.μ)
         obs_mean .-= val
 
         R = cov2psdmatrix(observation_noise_cov; d=o)
+        R = to_factorized_matrix(integ.cache.covariance_factorization, R)
 
         # _A = x.Σ.R * H'
         # obs_cov = _A'_A + R
-        obs_cov = PSDMatrix(qr!([x.Σ.R * H'; R.R]).R)
+        obs_cov = PSDMatrix(make_obscov_sqrt(x.Σ.R, H, R.R))
+
         obs = Gaussian(obs_mean, obs_cov)
 
-        @unpack x_tmp, K1, C_DxD, C_dxd, C_Dxd, C_d = integ.cache
-        K1 = view(K1, :, 1:o)
-        C_dxd = view(C_dxd, 1:o, 1:o)
-        C_Dxd = view(C_Dxd, :, 1:o)
-        C_d = view(C_d, 1:o)
-        _x = copy!(x_tmp, x)
+        if o != d && !(integ.alg isa EK1)
+            error("Partial observations only work with the EK1 right now")
+        end
+        _cache = make_obssized_cache(integ.cache; o)
+        @unpack K1, C_DxD, C_dxd, C_Dxd, C_d = _cache
+        _x = copy!(integ.cache.x_tmp, x)
         _, ll = update!(x, _x, obs, H, K1, C_Dxd, C_DxD, C_dxd, C_d; R=R)
 
         if !isnothing(loglikelihood)
@@ -85,3 +82,37 @@ function DataUpdateCallback(
     end
     return PresetTimeCallback(data.t, affect!; save_positions, kwargs...)
 end
+
+make_obscov_sqrt(PR::AbstractMatrix, H::AbstractMatrix, RR::AbstractMatrix) =
+    qr!([PR * H'; RR]).R
+make_obscov_sqrt(
+    PR::IsometricKroneckerProduct,
+    H::IsometricKroneckerProduct,
+    RR::IsometricKroneckerProduct,
+) =
+    IsometricKroneckerProduct(PR.ldim, make_obscov_sqrt(PR.B, H.B, RR.B))
+make_obscov_sqrt(PR::BlockDiag, H::BlockDiag, RR::BlockDiag) =
+    BlockDiag([
+        make_obscov_sqrt(blocks(PR)[i], blocks(H)[i], blocks(RR)[i]) for
+        i in eachindex(blocks(PR))
+    ])
+
+function make_obssized_cache(cache; o)
+    if o == cache.d
+        return cache
+    else
+        return make_obssized_cache(cache.covariance_factorization, cache; o)
+    end
+end
+function make_obssized_cache(::DenseCovariance, cache; o)
+    @unpack K1, C_DxD, C_dxd, C_Dxd, C_d, m_tmp, x_tmp = cache
+    return (
+        K1=view(K1, :, 1:o),
+        C_dxd=view(C_dxd, 1:o, 1:o),
+        C_Dxd=view(C_Dxd, :, 1:o),
+        C_d=view(C_d, 1:o),
+        C_DxD=C_DxD,
+        m_tmp=Gaussian(view(m_tmp.μ, 1:o), view(m_tmp.Σ, 1:o, 1:o)),
+        x_tmp=x_tmp,
+    )
+end
diff --git a/src/checks.jl b/src/checks.jl
index 6fb213b08..819ebe454 100644
--- a/src/checks.jl
+++ b/src/checks.jl
@@ -16,7 +16,7 @@ function check_densesmooth(integ)
         error("To use `dense=true` you need to set `smooth=true`!")
     end
     if !integ.opts.save_everystep && integ.alg.smooth
-        error("If you do not save all values, you do not need to smooth!")
+        error("If you set `save_everystep=false` also set `smooth=false` in the alg!")
     end
 end
 function check_saveiter(integ)
diff --git a/src/covariance_structure.jl b/src/covariance_structure.jl
index 7390b5124..2809a3e42 100644
--- a/src/covariance_structure.jl
+++ b/src/covariance_structure.jl
@@ -7,20 +7,9 @@ struct DenseCovariance{T} <: CovarianceStructure{T}
     d::Int64
     q::Int64
 end
-
-function get_covariance_structure(alg; elType, d, q)
-    if (
-        alg isa EK0 &&
-        !(
-            alg.diffusionmodel isa DynamicMVDiffusion ||
-            alg.diffusionmodel isa FixedMVDiffusion
-        ) &&
-        alg.prior isa IWP
-    )
-        return IsometricKroneckerCovariance{elType}(d, q)
-    else
-        return DenseCovariance{elType}(d, q)
-    end
+struct BlockDiagonalCovariance{T} <: CovarianceStructure{T}
+    d::Int64
+    q::Int64
 end
 
 factorized_zeros(C::IsometricKroneckerCovariance{T}, sizes...) where {T} = begin
@@ -41,9 +30,34 @@ factorized_zeros(::DenseCovariance{T}, sizes...) where {T} =
 factorized_similar(::DenseCovariance{T}, size1, size2) where {T} =
     similar(Matrix{T}, size1, size2)
 
+factorized_zeros(C::BlockDiagonalCovariance{T}, sizes...) where {T} = begin
+    for s in sizes
+        @assert s % C.d == 0
+    end
+    return BlockDiag([Array{T}(calloc, (s ÷ C.d for s in sizes)...) for _ in 1:C.d])
+end
+factorized_similar(C::BlockDiagonalCovariance{T}, size1, size2) where {T} = begin
+    for s in (size1, size2)
+        @assert s % C.d == 0
+    end
+    return BlockDiag([similar(Matrix{T}, size1 ÷ C.d, size2 ÷ C.d) for _ in 1:C.d])
+end
+
 to_factorized_matrix(::DenseCovariance, M::AbstractMatrix) = Matrix(M)
 to_factorized_matrix(::IsometricKroneckerCovariance, M::IsometricKroneckerProduct) = M
-for FT in [:DenseCovariance, :IsometricKroneckerCovariance]
+to_factorized_matrix(C::BlockDiagonalCovariance, M::IsometricKroneckerProduct) =
+    BlockDiag([copy(M.B) for _ in 1:C.d])
+to_factorized_matrix(C::BlockDiagonalCovariance, M::Diagonal) =
+    copy!(factorized_similar(C, size(M)...), M)
+to_factorized_matrix(
+    C::IsometricKroneckerCovariance, M::Diagonal{<:Number,<:FillArrays.Fill}) = begin
+    out = factorized_similar(C, size(M)...)
+    @assert length(out.B) == 1
+    out.B .= M.diag.value
+    out
+end
+
+for FT in [:DenseCovariance, :IsometricKroneckerCovariance, :BlockDiagonalCovariance]
     @eval to_factorized_matrix(FAC::$FT, M::PSDMatrix) =
         PSDMatrix(to_factorized_matrix(FAC, M.R))
 end
diff --git a/src/data_likelihoods/fenrir.jl b/src/data_likelihoods/fenrir.jl
index a805f9de9..01e3a3746 100644
--- a/src/data_likelihoods/fenrir.jl
+++ b/src/data_likelihoods/fenrir.jl
@@ -57,6 +57,7 @@ function fenrir_data_loglik(
     # Fit the ODE solution / PN posterior to the provided data; this is the actual Fenrir
     o = length(data.u[1])
     R = cov2psdmatrix(observation_noise_cov; d=o)
+    R = to_factorized_matrix(integ.cache.covariance_factorization, R)
     LL, _, _ = fit_pnsolution_to_data!(sol, R, data; proj=observation_matrix)
 
     return LL
@@ -74,17 +75,10 @@ function fit_pnsolution_to_data!(
     LL = zero(eltype(sol.prob.p))
 
     o = length(data.u[1])
-    @unpack x_tmp, C_dxd, C_d, K1, C_Dxd, C_DxD, m_tmp = cache
-    _cache = (
-        x_tmp=x_tmp,
-        C_DxD=C_DxD,
-        C_Dxd=view(C_Dxd, :, 1:o),
-        C_dxd=view(C_dxd, 1:o, 1:o),
-        C_d=view(C_d, 1:o),
-        K1=view(K1, :, 1:o),
-        K2=view(C_Dxd, :, 1:o),
-        m_tmp=Gaussian(view(m_tmp.μ, 1:o), view(m_tmp.Σ, 1:o, 1:o)),
-    )
+    d = cache.d
+    @unpack x_tmp, m_tmp = cache
+    _cache = make_obssized_cache(cache; o)
+    @unpack K1, C_DxD, C_dxd, C_Dxd, C_d = _cache
 
     x_posterior = copy(sol.x_filt) # the object to be filled
     state2data_projmat = proj * cache.SolProj
@@ -136,9 +130,7 @@ function measure_and_update!(x, u, H, R::PSDMatrix, cache)
     z, S = cache.m_tmp
     _matmul!(z, H, x.μ)
     z .-= u
-    _matmul!(cache.C_Dxd, x.Σ.R, H')
-    _matmul!(S, cache.C_Dxd', cache.C_Dxd)
-    S .+= _matmul!(cache.C_dxd, R.R', R.R)
+    S = PSDMatrix(make_obscov_sqrt(x.Σ.R, H, R.R))
     msmnt = Gaussian(z, S)
 
     return update!(x, copy!(cache.x_tmp, x), msmnt, H; R=R, cache)
diff --git a/src/derivative_utils.jl b/src/derivative_utils.jl
index 9b84fb1c8..bbfae4482 100644
--- a/src/derivative_utils.jl
+++ b/src/derivative_utils.jl
@@ -8,7 +8,15 @@ function calc_H!(H, integ, cache)
         calc_H_EK0!(H, integ, cache)
         # @assert integ.u == @view x_pred.μ[1:(q+1):end]
         OrdinaryDiffEq.calc_J!(ddu, integ, cache, true)
-        ProbNumDiffEq._matmul!(H, view(ddu, 1:d, :), cache.SolProj, -1.0, 1.0)
+        _ddu = size(ddu, 2) != d ? view(ddu, 1:d, :) : ddu
+        _matmul!(H, _ddu, cache.SolProj, -1.0, 1.0)
+    elseif integ.alg isa DiagonalEK1
+        calc_H_EK0!(H, integ, cache)
+        OrdinaryDiffEq.calc_J!(ddu, integ, cache, true)
+        ddu_diag = Diagonal(ddu)
+        _matmul!(H, ddu_diag, cache.SolProj, -1.0, 1.0)
+    else
+        error("Unknown algorithm")
     end
     return nothing
 end
diff --git a/src/diffusions.jl b/src/diffusions.jl
deleted file mode 100644
index 3570a22f7..000000000
--- a/src/diffusions.jl
+++ /dev/null
@@ -1,213 +0,0 @@
-abstract type AbstractDiffusion end
-abstract type AbstractStaticDiffusion <: AbstractDiffusion end
-abstract type AbstractDynamicDiffusion <: AbstractDiffusion end
-isstatic(diffusion::AbstractStaticDiffusion) = true
-isdynamic(diffusion::AbstractStaticDiffusion) = false
-isstatic(diffusion::AbstractDynamicDiffusion) = false
-isdynamic(diffusion::AbstractDynamicDiffusion) = true
-
-apply_diffusion(Q::PSDMatrix, diffusion::Diagonal) = X_A_Xt(Q, sqrt.(diffusion))
-apply_diffusion(Q::PSDMatrix, diffusion::Number) = PSDMatrix(Q.R * sqrt.(diffusion))
-
-estimate_global_diffusion(diffusion::AbstractDynamicDiffusion, d, q, Eltype) = NaN
-
-"""
-    DynamicDiffusion()
-
-Time-varying, isotropic diffusion, which is quasi-maximum-likelihood-estimated at each step.
-
-**This is the recommended diffusion when using adaptive step-size selection,** and in
-particular also when solving stiff systems.
-"""
-struct DynamicDiffusion <: AbstractDynamicDiffusion end
-initial_diffusion(::DynamicDiffusion, d, q, Eltype) = one(Eltype)
-estimate_local_diffusion(::DynamicDiffusion, integ) = local_scalar_diffusion(integ.cache)
-
-"""
-    DynamicMVDiffusion()
-
-Time-varying, diagonal diffusion, which is quasi-maximum-likelihood-estimated at each step.
-
-**Only works with the [`EK0`](@ref)!**
-
-A multi-variate version of [`DynamicDiffusion`](@ref), where instead of an isotropic matrix,
-a diagonal matrix is estimated. This can be helpful to get more expressive posterior
-covariances when using the [`EK0`](@ref), since the individual dimensions can be adjusted
-separately.
-
-# References
-* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
-"""
-struct DynamicMVDiffusion <: AbstractDynamicDiffusion end
-initial_diffusion(::DynamicMVDiffusion, d, q, Eltype) =
-    kron(Diagonal(ones(Eltype, d)), Diagonal(ones(Eltype, q + 1)))
-estimate_local_diffusion(::DynamicMVDiffusion, integ) =
-    local_diagonal_diffusion(integ.cache)
-
-"""
-    FixedDiffusion(; initial_diffusion=1.0, calibrate=true)
-
-Time-fixed, isotropic diffusion, which is (optionally) quasi-maximum-likelihood-estimated.
-
-**This is the recommended diffusion when using fixed steps.**
-
-By default with `calibrate=true`, all covariances are re-scaled at the end of the solve
-with the MLE diffusion. Set `calibrate=false` to skip this step, e.g. when setting the
-`initial_diffusion` and then estimating the diffusion outside of the solver
-(e.g. with [Fenrir.jl](https://github.com/nathanaelbosch/Fenrir.jl)).
-"""
-Base.@kwdef struct FixedDiffusion{T<:Number} <: AbstractStaticDiffusion
-    initial_diffusion::T = 1.0
-    calibrate::Bool = true
-end
-initial_diffusion(diffusionmodel::FixedDiffusion, d, q, Eltype) =
-    diffusionmodel.initial_diffusion * one(Eltype)
-estimate_local_diffusion(::FixedDiffusion, integ) = local_scalar_diffusion(integ.cache)
-function estimate_global_diffusion(::FixedDiffusion, integ)
-    @unpack d, measurement, m_tmp, Smat = integ.cache
-    # sol_diffusions = integ.sol.diffusions
-
-    v, S = measurement.μ, measurement.Σ
-    e = m_tmp.μ
-    e .= v
-    diffusion_t = if S isa IsometricKroneckerProduct
-        @assert length(S.B) == 1
-        dot(v, e) / d / S.B[1]
-    else
-        S_chol = cholesky!(S)
-        ldiv!(S_chol, e)
-        dot(v, e) / d
-    end
-
-    if integ.success_iter == 0
-        # @assert length(sol_diffusions) == 0
-        global_diffusion = diffusion_t
-        return global_diffusion
-    else
-        # @assert length(sol_diffusions) == integ.success_iter
-        diffusion_prev = integ.cache.global_diffusion
-        global_diffusion =
-            diffusion_prev + (diffusion_t - diffusion_prev) / integ.success_iter
-        # @info "compute diffusion" diffusion_prev global_diffusion
-        return global_diffusion
-    end
-end
-
-"""
-    FixedMVDiffusion(; initial_diffusion=1.0, calibrate=true)
-
-Time-fixed, diagonal diffusion, which is quasi-maximum-likelihood-estimated at each step.
-
-**Only works with the [`EK0`](@ref)!**
-
-A multi-variate version of [`FixedDiffusion`](@ref), where instead of an isotropic matrix,
-a diagonal matrix is estimated. This can be helpful to get more expressive posterior
-covariances when using the [`EK0`](@ref), since the individual dimensions can be adjusted
-separately.
-
-# References
-* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
-"""
-Base.@kwdef struct FixedMVDiffusion{T} <: AbstractStaticDiffusion
-    initial_diffusion::T = 1.0
-    calibrate::Bool = true
-end
-function initial_diffusion(diffusionmodel::FixedMVDiffusion, d, q, Eltype)
-    initdiff = diffusionmodel.initial_diffusion
-    @assert initdiff isa Number || length(initdiff) == d
-    return kron(Diagonal(initdiff .* ones(Eltype, d)), Diagonal(ones(Eltype, q + 1)))
-end
-estimate_local_diffusion(::FixedMVDiffusion, integ) = local_diagonal_diffusion(integ.cache)
-function estimate_global_diffusion(::FixedMVDiffusion, integ)
-    @unpack d, q, measurement, local_diffusion = integ.cache
-
-    v, S = measurement.μ, measurement.Σ
-    # @assert diag(S) |> unique |> length == 1
-    S_11 = S[1, 1]
-
-    Σ_ii = v .^ 2 ./ S_11
-    Σ = Diagonal(Σ_ii)
-    Σ_out = kron(Σ, I(q + 1)) # -> Different for each dimension; same for each derivative
-
-    if integ.success_iter == 0
-        # @assert length(diffusions) == 0
-        return Σ_out
-    else
-        # @assert length(diffusions) == integ.success_iter
-        diffusion_prev = integ.cache.global_diffusion
-        diffusion =
-            @. diffusion_prev =
-                diffusion_prev + (Σ_out - diffusion_prev) / integ.success_iter
-        return diffusion
-    end
-end
-
-"""
-    local_scalar_diffusion(integ)
-
-Compute the local, scalar diffusion estimate.
-
-Corresponds to
-```math
-σ² = zᵀ (H Q H^T)⁻¹ z,
-```
-where ``z, H, Q`` are taken from the passed integrator.
-
-For more background information
-* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
-"""
-function local_scalar_diffusion(cache)
-    @unpack d, R, H, Qh, measurement, m_tmp, Smat, C_Dxd = cache
-    z = measurement.μ
-    e, HQH = m_tmp.μ, m_tmp.Σ
-    _matmul!(C_Dxd, Qh.R, H')
-    _matmul!(HQH, C_Dxd', C_Dxd)
-    e .= z
-    σ² = if HQH isa IsometricKroneckerProduct
-        @assert length(HQH.B) == 1
-        dot(z, e) / d / HQH.B[1]
-    else
-        C = cholesky!(HQH)
-        ldiv!(C, e)
-        dot(z, e) / d
-    end
-    return σ²
-end
-
-"""
-    local_diagonal_diffusion(cache)
-
-Compute the local, scalar diffusion estimate.
-
-Corresponds to
-```math
-Σ_{ii} = z_i^2 / (H Q H^T)_{ii},
-```
-where ``z, H, Q`` are taken from the passed integrator.
-**This should only be used with the EK0!**
-
-For more background information
-* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
-"""
-function local_diagonal_diffusion(cache)
-    @unpack d, q, H, Qh, measurement, m_tmp, tmp = cache
-    @unpack local_diffusion = cache
-    z = measurement.μ
-    # HQH = H * unfactorize(Qh) * H'
-    # @assert HQH |> diag |> unique |> length == 1
-    # c1 = view(_matmul!(cache.C_Dxd, Qh.R, H'), :, 1)
-    c1 = mul!(view(cache.C_Dxd, :, 1:1), Qh.R, view(H, 1:1, :)')
-    Q0_11 = dot(c1, c1)
-
-    Σ_ii = @. m_tmp.μ = z^2 / Q0_11
-    Σ = Diagonal(Σ_ii)
-
-    # local_diffusion = kron(Σ, I(q+1))
-    # -> Different for each dimension; same for each derivative
-    for i in 1:d
-        for j in (i-1)*(q+1)+1:i*(q+1)
-            local_diffusion[j, j] = Σ[i, i]
-        end
-    end
-    return local_diffusion
-end
diff --git a/src/diffusions/apply_diffusion.jl b/src/diffusions/apply_diffusion.jl
new file mode 100644
index 000000000..9638ba587
--- /dev/null
+++ b/src/diffusions/apply_diffusion.jl
@@ -0,0 +1,102 @@
+"""
+    apply_diffusion(Q::PSDMatrix, diffusion::Union{Number, Diagonal}) -> PSDMatrix
+
+Apply the diffusion to the PSD transition noise covariance `Q`, return the result.
+"""
+apply_diffusion(
+    Q::PSDMatrix,
+    diffusion::Number,
+) = PSDMatrix(Q.R * sqrt.(diffusion))
+apply_diffusion(
+    Q::PSDMatrix,
+    diffusion::Diagonal{T,<:FillArrays.Fill},
+) where {T} = apply_diffusion(Q, diffusion.diag.value)
+apply_diffusion(
+    Q::PSDMatrix{T,<:BlockDiag},
+    diffusion::Diagonal{T,<:Vector},
+) where {T} = PSDMatrix(
+    BlockDiag([blocks(Q.R)[i] * sqrt.(diffusion.diag[i]) for i in eachindex(blocks(Q.R))]))
+apply_diffusion(
+    Q::PSDMatrix{T,<:Matrix},
+    diffusion::Diagonal{T,<:Vector},
+) where {T} = begin
+    d = size(diffusion, 1)
+    q = size(Q, 1) ÷ d - 1
+    return PSDMatrix(Q.R * sqrt.(Kronecker.kronecker(diffusion, Eye(q + 1))))
+end
+
+"""
+    apply_diffusion!(Q::PSDMatrix, diffusion::Union{Number, Diagonal}) -> PSDMatrix
+
+Apply the diffusion to the PSD transition noise covariance `Q` in place and return the result.
+"""
+apply_diffusion!(
+    Q::PSDMatrix,
+    diffusion::Diagonal{T,<:FillArrays.Fill},
+) where {T} = begin
+    rmul!(Q.R, sqrt.(diffusion.diag.value))
+    return Q
+end
+apply_diffusion!(
+    Q::PSDMatrix{T,<:BlockDiag},
+    diffusion::Diagonal{T,<:Vector},
+) where {T} = begin
+    @simd ivdep for i in eachindex(blocks(Q.R))
+        rmul!(blocks(Q.R)[i], sqrt(diffusion.diag[i]))
+    end
+    return Q
+end
+apply_diffusion!(
+    Q::PSDMatrix,
+    diffusion::Diagonal,
+) = begin
+    # @warn "This is not yet implemented efficiently; TODO"
+    d = size(diffusion, 1)
+    D = size(Q, 1)
+    q = D ÷ d - 1
+    # _matmul!(Q.R, Q.R, Kronecker.kronecker(sqrt.(diffusion), Eye(q + 1)))
+    _matmul!(Q.R, Q.R, kron(sqrt.(diffusion), Eye(q + 1)))
+    return Q
+end
+
+"""
+    apply_diffusion!(out::PSDMatrix, Q::PSDMatrix, diffusion::Union{Number, Diagonal}) -> PSDMatrix
+
+Apply the diffusion to the PSD transition noise covariance `Q` and store the result in `out`.
+"""
+apply_diffusion!(
+    out::PSDMatrix,
+    Q::PSDMatrix,
+    diffusion::Number,
+) = begin
+    _matmul!(out.R, Q.R, sqrt.(diffusion))
+    return out
+end
+apply_diffusion!(
+    out::PSDMatrix,
+    Q::PSDMatrix,
+    diffusion::Diagonal{<:Number,<:FillArrays.Fill},
+) = apply_diffusion!(out, Q, diffusion.diag.value)
+apply_diffusion!(
+    out::PSDMatrix{T,<:BlockDiag},
+    Q::PSDMatrix{T,<:BlockDiag},
+    diffusion::Diagonal{<:T,<:Vector},
+) where {T} = begin
+    @simd ivdep for i in eachindex(blocks(Q.R))
+        _matmul!(blocks(out.R)[i], blocks(Q.R)[i], sqrt(diffusion.diag[i]))
+    end
+    return out
+end
+apply_diffusion!(
+    out::PSDMatrix,
+    Q::PSDMatrix,
+    diffusion::Diagonal,
+) = begin
+    # @warn "This is not yet implemented efficiently; TODO"
+    d = size(diffusion, 1)
+    D = size(Q, 1)
+    q = D ÷ d - 1
+    # _matmul!(out.R, Q.R, Kronecker.kronecker(sqrt.(diffusion), Eye(q + 1)))
+    _matmul!(out.R, Q.R, kron(sqrt.(diffusion), Eye(q + 1)))
+    return out
+end
diff --git a/src/diffusions/calibration.jl b/src/diffusions/calibration.jl
new file mode 100644
index 000000000..6b22b3da3
--- /dev/null
+++ b/src/diffusions/calibration.jl
@@ -0,0 +1,178 @@
+@doc raw"""
+   invquad(v, M; v_cache, M_cache)
+
+Compute ``v' M^{-1} v`` without allocations and with Matrix-specific specializations.
+
+Needed for MLE diffusion estimation.
+"""
+invquad
+invquad(v, M::Matrix; v_cache, M_cache) = begin
+    v_cache .= v
+    M_chol = cholesky!(copy!(M_cache, M))
+    ldiv!(M_chol, v_cache)
+    dot(v, v_cache)
+end
+invquad(v, M::IsometricKroneckerProduct; v_cache, M_cache=nothing) = begin
+    v_cache .= v
+    @assert length(M.B) == 1
+    return dot(v, v_cache) / M.B[1]
+end
+invquad(v, M::BlockDiag; v_cache, M_cache=nothing) = begin
+    v_cache .= v
+    @assert length(M.blocks) == length(v) == length(v_cache)
+    @simd ivdep for i in eachindex(v)
+        @assert length(M.blocks[i]) == 1
+        @inbounds v_cache[i] /= M.blocks[i][1]
+    end
+    return dot(v, v_cache)
+end
+
+@doc raw"""
+    estimate_global_diffusion(::FixedDiffusion, integ)
+
+Updates the global quasi-MLE diffusion estimate on the current measuremnt.
+
+The global quasi-MLE diffusion estimate Corresponds to
+```math
+\hat{σ}^2_N = \frac{1}{Nd} \sum_{i=1}^N z_i^T S_i^{-1} z_i,
+```
+where ``z_i, S_i`` are taken the predicted observations from each step.
+This function updates the iteratively computed global diffusion estimate by computing
+```math
+\hat{σ}^2_n = \hat{σ}^2_{n-1} + ((z_n^T S_n^{-1} z_n) / d - \hat{σ}^2_{n-1}) / n.
+```
+
+For more background information
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+function estimate_global_diffusion(::FixedDiffusion, integ)
+    @unpack d, measurement, m_tmp, Smat = integ.cache
+    v, S = measurement.μ, measurement.Σ
+    _v, _S = m_tmp.μ, m_tmp.Σ
+
+    diffusion_increment = invquad(v, S; v_cache=_v, M_cache=_S) / d
+
+    new_mle_diffusion = if integ.success_iter == 0
+        diffusion_increment
+    else
+        current_mle_diffusion = integ.cache.global_diffusion.diag.value
+        current_mle_diffusion +
+        (diffusion_increment - current_mle_diffusion) / integ.success_iter
+    end
+
+    integ.cache.global_diffusion = new_mle_diffusion * Eye(d)
+    return integ.cache.global_diffusion
+end
+
+@doc raw"""
+    estimate_global_diffusion(::FixedMVDiffusion, integ)
+
+Updates the multivariate global quasi-MLE diffusion estimate on the current measuremnt.
+
+**This only works with the EK0!**
+
+The global quasi-MLE diffusion estimate Corresponds to
+```math
+[\hat{Σ}^2_N]_{jj} = \frac{1}{N} \sum_{i=1}^N [z_i]_j^2 / [S_i]_{11},
+```
+where ``z_i, S_i`` are taken the predicted observations from each step.
+This function updates the iteratively computed global diffusion estimate by computing
+```math
+[\hat{Σ}^2_n]_{jj} = [\hat{Σ}^2_{n-1}]_{jj} + ([z_n]_j^2 / [S_n]_{11}, - [\hat{Σ}^2_{n-1}]_{jj}) / n.
+```
+
+For more background information
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+function estimate_global_diffusion(::FixedMVDiffusion, integ)
+    @unpack d, q, measurement, local_diffusion, C_d = integ.cache
+    v, S = measurement.μ, measurement.Σ
+    # @assert diag(S) |> unique |> length == 1
+    diffusion_increment = let
+        @.. C_d = v^2 / S[1, 1]
+        Diagonal(C_d)
+    end
+
+    new_mle_diffusion = if integ.success_iter == 0
+        diffusion_increment
+    else
+        current_mle_diffusion = integ.cache.global_diffusion
+        @.. current_mle_diffusion +
+            (diffusion_increment - current_mle_diffusion) / integ.success_iter
+    end
+
+    copy!(integ.cache.global_diffusion, new_mle_diffusion)
+    return integ.cache.global_diffusion
+end
+
+@doc raw"""
+    local_scalar_diffusion(integ)
+
+Compute the local scalar quasi-MLE diffusion estimate.
+
+Corresponds to
+```math
+σ² = zᵀ (H Q H^T)⁻¹ z,
+```
+where ``z, H, Q`` are taken from the passed integrator.
+
+For more background information
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+function local_scalar_diffusion(cache)
+    @unpack d, R, H, Qh, measurement, m_tmp, Smat, C_Dxd, C_d, C_dxd = cache
+    z = measurement.μ
+    HQH = let
+        _matmul!(C_Dxd, Qh.R, H')
+        _matmul!(C_dxd, C_Dxd', C_Dxd)
+    end
+    σ² = invquad(z, HQH; v_cache=C_d, M_cache=C_dxd) / d
+    cache.local_diffusion = σ² * Eye(d)
+    return cache.local_diffusion
+end
+
+@doc raw"""
+    local_diagonal_diffusion(cache)
+
+Compute the local diagonal quasi-MLE diffusion estimate.
+
+**This only works with the EK0!**
+
+Corresponds to
+```math
+Σ_{ii} = z_i^2 / (H Q H^T)_{ii},
+```
+where ``z, H, Q`` are taken from the passed integrator.
+
+For more background information
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+function local_diagonal_diffusion(cache)
+    @unpack d, q, H, Qh, measurement, m_tmp, tmp = cache
+    @unpack local_diffusion = cache
+    @assert H == cache.E1
+
+    z = measurement.μ
+    # HQH = H * unfactorize(Qh) * H'
+    # @assert HQH |> diag |> unique |> length == 1
+    # c1 = view(_matmul!(cache.C_Dxd, Qh.R, H'), :, 1)
+    # Q_11 = dot(c1, c1)
+
+    Q_11 = if Qh.R isa BlockDiag
+        for i in 1:d
+            c1 = _matmul!(
+                view(cache.C_Dxd.blocks[i], :, 1:1),
+                Qh.R.blocks[i],
+                view(H.blocks[i], 1:1, :)',
+            )
+            tmp[i] = dot(c1, c1)
+        end
+        tmp
+    else
+        @warn "This is not yet implemented efficiently; TODO"
+        diag(X_A_Xt(Qh, H))
+    end
+
+    @. local_diffusion.diag = z^2 / Q_11
+    return local_diffusion
+end
diff --git a/src/diffusions/typedefs.jl b/src/diffusions/typedefs.jl
new file mode 100644
index 000000000..1ebfb1954
--- /dev/null
+++ b/src/diffusions/typedefs.jl
@@ -0,0 +1,103 @@
+abstract type AbstractDiffusion end
+abstract type AbstractStaticDiffusion <: AbstractDiffusion end
+abstract type AbstractDynamicDiffusion <: AbstractDiffusion end
+isstatic(diffusion::AbstractStaticDiffusion) = true
+isdynamic(diffusion::AbstractStaticDiffusion) = false
+isstatic(diffusion::AbstractDynamicDiffusion) = false
+isdynamic(diffusion::AbstractDynamicDiffusion) = true
+
+estimate_global_diffusion(diffusion::AbstractDynamicDiffusion, d, q, Eltype) =
+    error("Not possible or not implemented")
+
+"""
+    DynamicDiffusion()
+
+Time-varying, isotropic diffusion, which is quasi-maximum-likelihood-estimated at each step.
+
+**This is the recommended diffusion when using adaptive step-size selection,** and in
+particular also when solving stiff systems.
+"""
+struct DynamicDiffusion <: AbstractDynamicDiffusion end
+initial_diffusion(::DynamicDiffusion, d, q, Eltype) = one(Eltype) * Eye(d)
+estimate_local_diffusion(::DynamicDiffusion, integ) = local_scalar_diffusion(integ.cache)
+
+"""
+    DynamicMVDiffusion()
+
+Time-varying, diagonal diffusion, which is quasi-maximum-likelihood-estimated at each step.
+
+**Only works with the [`EK0`](@ref)!**
+
+A multi-variate version of [`DynamicDiffusion`](@ref), where instead of an isotropic matrix,
+a diagonal matrix is estimated. This can be helpful to get more expressive posterior
+covariances when using the [`EK0`](@ref), since the individual dimensions can be adjusted
+separately.
+
+# References
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+struct DynamicMVDiffusion <: AbstractDynamicDiffusion end
+initial_diffusion(::DynamicMVDiffusion, d, q, Eltype) = Diagonal(ones(Eltype, d))
+estimate_local_diffusion(::DynamicMVDiffusion, integ) =
+    local_diagonal_diffusion(integ.cache)
+
+"""
+    FixedDiffusion(; initial_diffusion=1.0, calibrate=true)
+
+Time-fixed, isotropic diffusion, which is (optionally) quasi-maximum-likelihood-estimated.
+
+**This is the recommended diffusion when using fixed steps.**
+
+By default with `calibrate=true`, all covariances are re-scaled at the end of the solve
+with the MLE diffusion. Set `calibrate=false` to skip this step, e.g. when setting the
+`initial_diffusion` and then estimating the diffusion outside of the solver
+(e.g. with [Fenrir.jl](https://github.com/nathanaelbosch/Fenrir.jl)).
+"""
+Base.@kwdef struct FixedDiffusion{T<:Number} <: AbstractStaticDiffusion
+    initial_diffusion::T = 1.0
+    calibrate::Bool = true
+end
+initial_diffusion(diffusionmodel::FixedDiffusion, d, q, Eltype) =
+    diffusionmodel.initial_diffusion * one(Eltype) * Eye(d)
+estimate_local_diffusion(::FixedDiffusion, integ) = local_scalar_diffusion(integ.cache)
+
+"""
+    FixedMVDiffusion(; initial_diffusion=1.0, calibrate=true)
+
+Time-fixed, diagonal diffusion, which is quasi-maximum-likelihood-estimated at each step.
+
+**Only works with the [`EK0`](@ref)!**
+
+A multi-variate version of [`FixedDiffusion`](@ref), where instead of an isotropic matrix,
+a diagonal matrix is estimated. This can be helpful to get more expressive posterior
+covariances when using the [`EK0`](@ref), since the individual dimensions can be adjusted
+separately.
+
+# References
+* [bosch20capos](@cite) Bosch et al, "Calibrated Adaptive Probabilistic ODE Solvers", AISTATS (2021)
+"""
+Base.@kwdef struct FixedMVDiffusion{T} <: AbstractStaticDiffusion
+    initial_diffusion::T = 1.0
+    calibrate::Bool = true
+end
+function initial_diffusion(diffusionmodel::FixedMVDiffusion, d, q, Eltype)
+    initdiff = diffusionmodel.initial_diffusion
+    if initdiff isa Number
+        return initdiff * one(Eltype) * I(d)
+    elseif initdiff isa AbstractVector
+        @assert length(initdiff) == d
+        return Diagonal(initdiff)
+    elseif initdiff isa Diagonal
+        @assert size(initdiff) == (d, d)
+        return initdiff
+    else
+        throw(
+            ArgumentError(
+                "Invalid `initial_diffusion`. The `FixedMVDiffusion` assumes a dxd diagonal diffusion model. So, pass either a Number, a Vector of length d, or a `Diagonal`.",
+            ),
+        )
+    end
+end
+estimate_local_diffusion(::FixedMVDiffusion, integ) =
+    integ.alg isa EK0 ? local_diagonal_diffusion(integ.cache) :
+    local_scalar_diffusion(integ.cache)
diff --git a/src/fast_linalg.jl b/src/fast_linalg.jl
index b81ad9924..cb3a549e1 100644
--- a/src/fast_linalg.jl
+++ b/src/fast_linalg.jl
@@ -13,46 +13,34 @@ _matmul!(C, A, B)
 _matmul!(C, A, B) = mul!(C, A, B)
 _matmul!(C, A, B, a, b) = mul!(C, A, B, a, b)
 # Use Octavian.jl's matrix-matrix products whenever applicable
+const MSR{T} = Union{Matrix{T},SubArray{T},Base.ReshapedArray{T},Adjoint{T,<:Matrix}}
 _matmul!(
-    C::AbstractMatrix{T},
-    A::AbstractMatrix{T},
-    B::AbstractMatrix{T},
+    C::MSR{T},
+    A::MSR{T},
+    B::MSR{T},
     alpha::Number,
     beta::Number,
 ) where {T<:LinearAlgebra.BlasFloat} = matmul!(C, A, B, alpha, beta)
 _matmul!(
-    C::AbstractMatrix{T},
-    A::AbstractMatrix{T},
-    B::AbstractMatrix{T},
+    C::MSR{T},
+    A::MSR{T},
+    B::MSR{T},
 ) where {T<:LinearAlgebra.BlasFloat} = matmul!(C, A, B)
 # Some exceptions where we'd rather broadcast with FastBroadcast.jl:
 # Matrix-scalar products
 _matmul!(C::AbstractVecOrMat, A::AbstractVecOrMat, b::Number) = @.. C = A * b
 _matmul!(C::AbstractVecOrMat, a::Number, B::AbstractVecOrMat) = @.. C = a * B
 # Matrix matrix products with diagonal matrices
-_matmul!(C::AbstractMatrix, A::AbstractMatrix, B::Diagonal) = (@.. C = A * B.diag')
-_matmul!(C::AbstractMatrix, A::Diagonal, B::AbstractMatrix) = (@.. C = A.diag * B)
-_matmul!(C::AbstractMatrix, A::Diagonal, B::Diagonal) = @.. C = A * B
-_matmul!(
-    C::AbstractMatrix{T},
-    A::AbstractMatrix{T},
-    B::Diagonal{T},
-) where {T<:LinearAlgebra.BlasFloat} = (@.. C = A * B.diag')
-_matmul!(
-    C::AbstractMatrix{T},
-    A::Diagonal{T},
-    B::AbstractMatrix{T},
-) where {T<:LinearAlgebra.BlasFloat} = (@.. C = A.diag * B)
-_matmul!(
-    C::AbstractMatrix{T},
-    A::Diagonal{T},
-    B::Diagonal{T},
-) where {T<:LinearAlgebra.BlasFloat} = @.. C = A * B
-_matmul!(
-    C::AbstractMatrix{T},
-    A::LowerTriangular{T},
-    B::UpperTriangular{T},
-) where {T<:LinearAlgebra.BlasFloat} = mul!(C, A, B)
+_matmul!(C::MSR, A::MSR, B::Diagonal) =
+    @.. C = A * B.diag'
+_matmul!(C::MSR, A::Diagonal, B::MSR) =
+    (@.. C = A.diag * B)
+_matmul!(C::MSR, A::Diagonal, B::Diagonal) =
+    (@.. C = A * B)
+_matmul!(C::MSR, A::MSR, B::Diagonal, alpha::Number, beta::Number) =
+    @.. C = A * B.diag' * alpha + C * beta
+_matmul!(C::MSR, A::Diagonal, B::MSR, alpha::Number, beta::Number) =
+    (@.. C = A.diag * B * alpha + C * beta)
 
 """
     getupperright!(A)
diff --git a/src/filtering/markov_kernel.jl b/src/filtering/markov_kernel.jl
index e980108dd..07873ede1 100644
--- a/src/filtering/markov_kernel.jl
+++ b/src/filtering/markov_kernel.jl
@@ -120,6 +120,28 @@ function marginalize_cov!(
     return marginalize_cov!(_Σ_out, _Σ_curr, _K; C_DxD=_C_DxD, C_3DxD=_C_3DxD)
 end
 
+function marginalize_cov!(
+    Σ_out::PSDMatrix{T,<:BlockDiag},
+    Σ_curr::PSDMatrix{T,<:BlockDiag},
+    K::AffineNormalKernel{
+        <:AbstractMatrix,
+        <:Any,
+        <:PSDMatrix{S,<:BlockDiag},
+    };
+    C_DxD::AbstractMatrix,
+    C_3DxD::AbstractMatrix,
+) where {T,S}
+    @simd ivdep for i in eachindex(blocks(Σ_out.R))
+        _Σ_out = PSDMatrix(Σ_out.R.blocks[i])
+        _Σ_curr = PSDMatrix(Σ_curr.R.blocks[i])
+        _K = AffineNormalKernel(K.A.blocks[i], K.b, PSDMatrix(K.C.R.blocks[i]))
+        _C_DxD = C_DxD.blocks[i]
+        _C_3DxD = C_3DxD.blocks[i]
+        marginalize_cov!(_Σ_out, _Σ_curr, _K; C_DxD=_C_DxD, C_3DxD=_C_3DxD)
+    end
+    return Σ_out
+end
+
 """
     compute_backward_kernel!(Kout, xpred, x, K; C_DxD[, diffusion=1])
 
@@ -200,11 +222,11 @@ function compute_backward_kernel!(
     end
     _matmul!(view(Λ.R, 1:D, 1:D), x.Σ.R, C_DxD)
     # Λ.R[D+1:2D, 1:D] = (G * Q.R')'
-    if !isone(diffusion)
-        _matmul!(C_DxD, Q.R, sqrt.(diffusion))
-        _matmul!(view(Λ.R, D+1:2D, 1:D), C_DxD, G')
-    else
+    if isone(diffusion)
         _matmul!(view(Λ.R, D+1:2D, 1:D), Q.R, G')
+    else
+        apply_diffusion!(PSDMatrix(C_DxD), Q, diffusion)
+        _matmul!(view(Λ.R, D+1:2D, 1:D), C_DxD, G')
     end
 
     return Kout
@@ -216,7 +238,7 @@ function compute_backward_kernel!(
     x::SRGaussian{T,<:IsometricKroneckerProduct},
     K::KT2;
     C_DxD::AbstractMatrix,
-    diffusion=1,
+    diffusion::Union{Number,Diagonal}=1,
 ) where {
     T,
     KT1<:AffineNormalKernel{
@@ -239,7 +261,61 @@ function compute_backward_kernel!(
     _x = Gaussian(reshape_no_alloc(x.μ, Q, d), PSDMatrix(x.Σ.R.B))
     _K = AffineNormalKernel(K.A.B, reshape_no_alloc(K.b, Q, d), PSDMatrix(K.C.R.B))
     _C_DxD = C_DxD.B
+    _diffusion =
+        diffusion isa Number ? diffusion :
+        diffusion isa IsometricKroneckerProduct ? diffusion.B : diffusion
 
     return compute_backward_kernel!(
-        _Kout, _x_pred, _x, _K; C_DxD=_C_DxD, diffusion=diffusion)
+        _Kout, _x_pred, _x, _K; C_DxD=_C_DxD, diffusion=_diffusion)
+end
+
+function compute_backward_kernel!(
+    Kout::KT1,
+    xpred::SRGaussian{T,<:BlockDiag},
+    x::SRGaussian{T,<:BlockDiag},
+    K::KT2;
+    C_DxD::AbstractMatrix,
+    diffusion::Union{Number,Diagonal}=1,
+) where {
+    T,
+    KT1<:AffineNormalKernel{
+        <:BlockDiag,
+        <:AbstractVector,
+        <:PSDMatrix{T,<:BlockDiag},
+    },
+    KT2<:AffineNormalKernel{
+        <:BlockDiag,
+        <:Any,
+        <:PSDMatrix{T,<:BlockDiag},
+    },
+}
+    d = length(blocks(xpred.Σ.R))
+    q = size(blocks(xpred.Σ.R)[1], 1) - 1
+
+    @simd ivdep for i in eachindex(blocks(xpred.Σ.R))
+        _Kout = AffineNormalKernel(
+            Kout.A.blocks[i],
+            view(Kout.b, (i-1)*(q+1)+1:i*(q+1)),
+            PSDMatrix(Kout.C.R.blocks[i]),
+        )
+        _xpred = Gaussian(
+            view(xpred.μ, (i-1)*(q+1)+1:i*(q+1)),
+            PSDMatrix(xpred.Σ.R.blocks[i]),
+        )
+        _x = Gaussian(
+            view(x.μ, (i-1)*(q+1)+1:i*(q+1)),
+            PSDMatrix(x.Σ.R.blocks[i]),
+        )
+        _K = AffineNormalKernel(
+            K.A.blocks[i],
+            ismissing(K.b) ? missing : view(K.b, (i-1)*(q+1)+1:i*(q+1)),
+            PSDMatrix(K.C.R.blocks[i]),
+        )
+        _C_DxD = C_DxD.blocks[i]
+        _diffusion = diffusion isa Number ? diffusion : diffusion.diag[i]
+        compute_backward_kernel!(
+            _Kout, _xpred, _x, _K, C_DxD=_C_DxD, diffusion=_diffusion,
+        )
+    end
+    return Kout
 end
diff --git a/src/filtering/predict.jl b/src/filtering/predict.jl
index 8738d74fa..b37fc5bd3 100644
--- a/src/filtering/predict.jl
+++ b/src/filtering/predict.jl
@@ -66,7 +66,7 @@ function predict_cov!(
     Qh::PSDMatrix,
     C_DxD::AbstractMatrix,
     C_2DxD::AbstractMatrix,
-    diffusion=1,
+    diffusion::Union{Number,Diagonal},
 )
     if iszero(diffusion)
         fast_X_A_Xt!(Σ_out, Σ_curr, Ah)
@@ -76,10 +76,10 @@ function predict_cov!(
     D = size(Qh, 1)
 
     _matmul!(view(R, 1:D, 1:D), Σ_curr.R, Ah')
-    if !isone(diffusion)
-        _matmul!(view(R, D+1:2D, 1:D), Qh.R, sqrt.(diffusion))
-    else
+    if isone(diffusion)
         @.. R[D+1:2D, 1:D] = Qh.R
+    else
+        apply_diffusion!(PSDMatrix(view(R, D+1:2D, 1:D)), Qh, diffusion)
     end
     _matmul!(M, R', R)
     chol = cholesky!(Symmetric(M), check=false)
@@ -101,7 +101,7 @@ function predict_cov!(
     Qh::PSDMatrix{S,<:IsometricKroneckerProduct},
     C_DxD::IsometricKroneckerProduct,
     C_2DxD::IsometricKroneckerProduct,
-    diffusion=1,
+    diffusion::Union{Number,Diagonal},
 ) where {T,S}
     _Σ_out = PSDMatrix(Σ_out.R.B)
     _Σ_curr = PSDMatrix(Σ_curr.R.B)
@@ -109,7 +109,33 @@ function predict_cov!(
     _Qh = PSDMatrix(Qh.R.B)
     _C_DxD = C_DxD.B
     _C_2DxD = C_2DxD.B
-    _diffusion = diffusion isa IsometricKroneckerProduct ? diffusion.B : diffusion
+    _diffusion =
+        diffusion isa Number ? diffusion :
+        diffusion isa IsometricKroneckerProduct ? diffusion.B : diffusion
 
     return predict_cov!(_Σ_out, _Σ_curr, _Ah, _Qh, _C_DxD, _C_2DxD, _diffusion)
 end
+
+# BlockDiagonal version
+function predict_cov!(
+    Σ_out::PSDMatrix{T,<:BlockDiag},
+    Σ_curr::PSDMatrix{T,<:BlockDiag},
+    Ah::BlockDiag,
+    Qh::PSDMatrix{S,<:BlockDiag},
+    C_DxD::BlockDiag,
+    C_2DxD::BlockDiag,
+    diffusion::Union{Number,Diagonal},
+) where {T,S}
+    @simd ivdep for i in eachindex(blocks(Σ_out.R))
+        predict_cov!(
+            PSDMatrix(Σ_out.R.blocks[i]),
+            PSDMatrix(Σ_curr.R.blocks[i]),
+            Ah.blocks[i],
+            PSDMatrix(Qh.R.blocks[i]),
+            C_DxD.blocks[i],
+            C_2DxD.blocks[i],
+            diffusion isa Number ? diffusion : diffusion.diag[i],
+        )
+    end
+    return Σ_out
+end
diff --git a/src/filtering/update.jl b/src/filtering/update.jl
index 3d9125fcd..4e4468480 100644
--- a/src/filtering/update.jl
+++ b/src/filtering/update.jl
@@ -193,6 +193,50 @@ function update!(
     return x_out, loglikelihood
 end
 
+function update!(
+    x_out::SRGaussian{T,<:BlockDiag},
+    x_pred::SRGaussian{T,<:BlockDiag},
+    measurement::Gaussian{
+        <:AbstractVector,
+        <:Union{<:PSDMatrix{T,<:BlockDiag},<:BlockDiag},
+    },
+    H::BlockDiag,
+    K1_cache::BlockDiag,
+    K2_cache::BlockDiag,
+    M_cache::BlockDiag,
+    C_dxd::BlockDiag,
+    C_d::AbstractVector;
+    R::Union{Nothing,PSDMatrix{T,<:BlockDiag}}=nothing,
+) where {T}
+    d = length(blocks(x_out.Σ.R))
+    q = size(blocks(x_out.Σ.R)[1], 1) - 1
+
+    ll = zero(eltype(x_out.μ))
+    for i in eachindex(blocks(x_out.Σ.R))
+        _, _ll = update!(
+            Gaussian(view(x_out.μ, (i-1)*(q+1)+1:i*(q+1)),
+                PSDMatrix(x_out.Σ.R.blocks[i])),
+            Gaussian(view(x_pred.μ, (i-1)*(q+1)+1:i*(q+1)),
+                PSDMatrix(x_pred.Σ.R.blocks[i])),
+            Gaussian(view(measurement.μ, i:i),
+                if measurement.Σ isa PSDMatrix
+                    PSDMatrix(measurement.Σ.R.blocks[i])
+                else
+                    measurement.Σ.blocks[i]
+                end),
+            H.blocks[i],
+            K1_cache.blocks[i],
+            K2_cache.blocks[i],
+            M_cache.blocks[i],
+            C_dxd.blocks[i],
+            view(C_d, i:i);
+            R=isnothing(R) ? nothing : PSDMatrix(blocks(R.R)[i]),
+        )
+        ll += _ll
+    end
+    return x_out, ll
+end
+
 # Short-hand with cache
 function update!(x_out, x, measurement, H; cache, R=nothing)
     @unpack K1, m_tmp, C_DxD, C_dxd, C_Dxd, C_d = cache
diff --git a/src/initialization/classicsolverinit.jl b/src/initialization/classicsolverinit.jl
index e7bd1547f..123ddadee 100644
--- a/src/initialization/classicsolverinit.jl
+++ b/src/initialization/classicsolverinit.jl
@@ -90,7 +90,7 @@ end
 function rk_init_improve(cache::AbstractODEFilterCache, ts, us, dt)
     @unpack A, Q = cache
     # @unpack Ah, Qh = cache
-    @unpack x, x_pred, x_filt, measurement = cache
+    @unpack x, x_pred, x_filt, measurement, x_tmp = cache
     @unpack K1, C_Dxd, C_DxD, C_dxd, C_3DxD, C_d = cache
     @unpack backward_kernel = cache
 
@@ -98,7 +98,7 @@ function rk_init_improve(cache::AbstractODEFilterCache, ts, us, dt)
     make_preconditioners!(cache, dt)
     @unpack P, PI = cache
 
-    _gaussian_mul!(x, P, x)
+    _gaussian_mul!(x, P, copy!(x_tmp, x))
 
     preds = []
     filts = [copy(x)]
diff --git a/src/integrator_utils.jl b/src/integrator_utils.jl
index e13c9e4b8..b27e0b60c 100644
--- a/src/integrator_utils.jl
+++ b/src/integrator_utils.jl
@@ -47,13 +47,12 @@ function calibrate_solution!(integ, mle_diffusion)
     set_diffusions!(integ.sol, mle_diffusion * integ.cache.default_diffusion)
 
     # Rescale all filtering estimates to have the correct diffusion
-    @assert mle_diffusion isa Number || mle_diffusion isa Diagonal
-    sqrt_diff = mle_diffusion isa Number ? sqrt(mle_diffusion) : sqrt.(mle_diffusion)
+    @assert mle_diffusion isa Diagonal
     @simd ivdep for C in integ.sol.x_filt.Σ
-        rmul!(C.R, sqrt_diff)
+        apply_diffusion!(C, mle_diffusion)
     end
     @simd ivdep for C in integ.sol.backward_kernels.C
-        rmul!(C.R, sqrt_diff)
+        apply_diffusion!(C, mle_diffusion)
     end
 
     # Re-write into the solution estimates
@@ -70,13 +69,17 @@ Set the contents of `solution.diffusions` to the provided `diffusion`, overwriti
 diffusion estimates that are in there. Typically, `diffusion` is either a global quasi-MLE
 or the specified initial diffusion value if no calibration is desired.
 """
-function set_diffusions!(solution::AbstractProbODESolution, diffusion::Number)
-    solution.diffusions .= diffusion
-    return nothing
-end
-function set_diffusions!(solution::AbstractProbODESolution, diffusion::Diagonal)
-    @simd ivdep for d in solution.diffusions
-        copy!(d, diffusion)
+function set_diffusions!(solution::AbstractProbODESolution, diffusion)
+    if diffusion isa Diagonal{<:Number,<:FillArrays.Fill}
+        @simd ivdep for i in eachindex(solution.diffusions)
+            solution.diffusions[i] = copy(diffusion)
+        end
+    elseif diffusion isa Diagonal{<:Number,<:Vector}
+        @simd ivdep for d in solution.diffusions
+            copy!(d, diffusion)
+        end
+    else
+        throw(ArgumentError("unexpected diffusion type $(typeof(diffusion))"))
     end
     return nothing
 end
diff --git a/src/kronecker.jl b/src/kronecker.jl
index 7c6d71a48..f18bffd74 100644
--- a/src/kronecker.jl
+++ b/src/kronecker.jl
@@ -62,6 +62,7 @@ end
 Base.:*(K::IKP, a::Number) = IsometricKroneckerProduct(K.ldim, K.B * a)
 Base.:*(a::Number, K::IKP) = IsometricKroneckerProduct(K.ldim, a * K.B)
 LinearAlgebra.adjoint(A::IKP) = IsometricKroneckerProduct(A.ldim, A.B')
+LinearAlgebra.rmul!(A::IKP, b::Number) = IsometricKroneckerProduct(A.ldim, rmul!(A.B, b))
 
 function check_same_size(A::IKP, B::IKP)
     if A.ldim != B.ldim || size(A.B) != size(B.B)
@@ -102,6 +103,13 @@ Base.:+(A::IKP, B::IKP) = begin
 end
 Base.:+(U::UniformScaling, K::IKP) = IsometricKroneckerProduct(K.ldim, U + K.B)
 Base.:+(K::IKP, U::UniformScaling) = IsometricKroneckerProduct(K.ldim, U + K.B)
+
+add!(out::IsometricKroneckerProduct, toadd::IsometricKroneckerProduct) = begin
+    @assert out.ldim == toadd.ldim
+    add!(out.B, toadd.B)
+    return out
+end
+
 Base.:-(U::UniformScaling, K::IKP) = IsometricKroneckerProduct(K.ldim, U - K.B)
 LinearAlgebra.inv(K::IKP) = IsometricKroneckerProduct(K.ldim, inv(K.B))
 Base.:/(A::IKP, B::IKP) = begin
@@ -152,6 +160,27 @@ _matmul!(
     return A
 end
 
+mul!(A::IKP, b::Number, C::IKP) = begin
+    check_matmul_sizes(A, C)
+    mul!(A.B, b, C.B)
+    return A
+end
+mul!(A::IKP, B::IKP, c::Number) = begin
+    check_matmul_sizes(A, B)
+    mul!(A.B, B.B, c)
+    return A
+end
+_matmul!(A::IKP, b::Number, C::IKP) = begin
+    check_matmul_sizes(A, C)
+    _matmul!(A.B, b, C.B)
+    return A
+end
+_matmul!(A::IKP, B::IKP, c::Number) = begin
+    check_matmul_sizes(A, B)
+    _matmul!(A.B, B.B, c)
+    return A
+end
+
 """
 Allocation-free reshape
 Found here: https://discourse.julialang.org/t/convert-array-into-matrix-in-place/55624/5
diff --git a/src/perform_step.jl b/src/perform_step.jl
index 29574b68a..d1c834284 100644
--- a/src/perform_step.jl
+++ b/src/perform_step.jl
@@ -124,7 +124,7 @@ function OrdinaryDiffEq.perform_step!(integ, cache::EKCache, repeat_step=false)
 
     # Update the global diffusion MLE (if applicable)
     if !isdynamic(cache.diffusionmodel)
-        cache.global_diffusion = estimate_global_diffusion(cache.diffusionmodel, integ)
+        estimate_global_diffusion(cache.diffusionmodel, integ)
     end
 
     # Advance the state
@@ -163,7 +163,7 @@ compute_measurement_covariance!(cache) = begin
     _matmul!(cache.C_Dxd, cache.x_pred.Σ.R, cache.H')
     _matmul!(cache.measurement.Σ, cache.C_Dxd', cache.C_Dxd)
     if !isnothing(cache.R)
-        cache.measurement.Σ .+= _matmul!(cache.C_dxd, cache.R.R', cache.R.R)
+        add!(cache.measurement.Σ, _matmul!(cache.C_dxd, cache.R.R', cache.R.R))
     end
 end
 
@@ -218,35 +218,22 @@ To save allocations, the function modifies the given `cache` and writes into
 `cache.C_Dxd` during some computations.
 """
 function estimate_errors!(cache::AbstractODEFilterCache)
-    @unpack local_diffusion, Qh, H, d = cache
-
-    R = cache.C_Dxd
-
-    if local_diffusion isa Diagonal
-        _QR = cache.C_DxD .= Qh.R .* sqrt.(local_diffusion.diag)'
-        _matmul!(R, _QR, H')
-        error_estimate = view(cache.tmp, 1:d)
-        sum!(abs2, error_estimate', view(R, :, 1:d))
-        error_estimate .= sqrt.(error_estimate)
-        return error_estimate
-    elseif local_diffusion isa Number
-        _matmul!(R, Qh.R, H')
-
-        # error_estimate = diag(PSDMatrix(R))
-        # error_estimate .*= local_diffusion
-        # error_estimate .= sqrt.(error_estimate)
-        # error_estimate = view(error_estimate, 1:d)
-
-        # faster:
-        error_estimate = view(cache.tmp, 1:d)
-        if R isa IsometricKroneckerProduct
-            error_estimate .= sum(abs2, R.B)
-        else
-            sum!(abs2, error_estimate', view(R, :, 1:d))
-        end
-        error_estimate .*= local_diffusion
-        error_estimate .= sqrt.(error_estimate)
+    @unpack local_diffusion, Qh, H, C_d, C_Dxd, C_DxD = cache
+    _Q = apply_diffusion!(PSDMatrix(C_DxD), Qh, local_diffusion)
+    _HQH = PSDMatrix(_matmul!(C_Dxd, _Q.R, H'))
+    error_estimate = diag!(C_d, _HQH)
+    @.. error_estimate = sqrt(error_estimate)
+    return error_estimate
+end
 
-        return error_estimate
+diag!(v::AbstractVector, M::PSDMatrix) = (sum!(abs2, v', M.R); v)
+diag!(v::AbstractVector, M::PSDMatrix{<:Number,<:IsometricKroneckerProduct}) =
+    v .= sum(abs2, M.R.B)
+diag!(v::AbstractVector, M::PSDMatrix{<:Number,<:BlockDiag}) = begin
+    @assert length(v) == nblocks(M.R)
+    @assert size(blocks(M.R)[1], 2) == 1 # assumes all of them have the same shape
+    @simd ivdep for i in eachindex(blocks(M.R))
+        v[i] = sum(abs2, blocks(M.R)[i])
     end
+    return v
 end
diff --git a/src/preconditioning.jl b/src/preconditioning.jl
index 8dfd9fac8..ad0e494a4 100644
--- a/src/preconditioning.jl
+++ b/src/preconditioning.jl
@@ -8,6 +8,13 @@ function init_preconditioner(C::DenseCovariance{elType}) where {elType}
     PI = kron(I(C.d), Diagonal(ones(elType, C.q + 1)))
     return P, PI
 end
+function init_preconditioner(C::BlockDiagonalCovariance{elType}) where {elType}
+    B = Diagonal(ones(elType, C.q + 1))
+    P = BlockDiag([B for _ in 1:C.d])
+    BI = Diagonal(ones(elType, C.q + 1))
+    PI = BlockDiag([BI for _ in 1:C.d])
+    return P, PI
+end
 
 function make_preconditioners!(cache, dt)
     @unpack P, PI, d, q = cache
@@ -31,15 +38,10 @@ end
     end
     return P
 end
-
-@fastmath @inbounds function make_preconditioner!(P::IsometricKroneckerProduct, h, d, q)
-    val = factorial(q) / h^(q + 1 / 2)
-    @simd ivdep for j in 0:q
-        P.B.diag[j+1] = val
-        val /= (q - j) / h
-    end
-    return P
-end
+make_preconditioner!(P::IsometricKroneckerProduct, h, d, q) =
+    (make_preconditioner!(P.B, h, 1, q); P)
+make_preconditioner!(P::BlockDiag, h, d, q) =
+    (make_preconditioner!(blocks(P)[1], h, 1, q); P)
 
 @fastmath @inbounds function make_preconditioner_inv!(PI::Diagonal, h, d, q)
     val = h^(q + 1 / 2) / factorial(q)
@@ -52,13 +54,7 @@ end
     end
     return PI
 end
-
-@fastmath @inbounds function make_preconditioner_inv!(
-    PI::IsometricKroneckerProduct, h, d, q)
-    val = h^(q + 1 / 2) / factorial(q)
-    @simd ivdep for j in 0:q
-        PI.B.diag[j+1] = val
-        val *= (q - j) / h
-    end
-    return PI
-end
+make_preconditioner_inv!(PI::IsometricKroneckerProduct, h, d, q) =
+    (make_preconditioner_inv!(PI.B, h, 1, q); PI)
+make_preconditioner_inv!(PI::BlockDiag, h, d, q) =
+    (make_preconditioner_inv!(blocks(PI)[1], h, 1, q); PI)
diff --git a/src/priors/iwp.jl b/src/priors/iwp.jl
index 3a7d7f311..43bc2397e 100644
--- a/src/priors/iwp.jl
+++ b/src/priors/iwp.jl
@@ -158,8 +158,8 @@ end
 function initialize_transition_matrices(FAC::IsometricKroneckerCovariance, p::IWP, dt)
     A, Q = preconditioned_discretize(p)
     P, PI = initialize_preconditioner(FAC, p, dt)
-    Ah = PI * A * P
-    Qh = PSDMatrix(Q.R * PI)
+    Ah = copy(A)
+    Qh = copy(Q)
     return A, Q, Ah, Qh, P, PI
 end
 function initialize_transition_matrices(FAC::DenseCovariance, p::IWP, dt)
@@ -169,6 +169,15 @@ function initialize_transition_matrices(FAC::DenseCovariance, p::IWP, dt)
     Ah, Qh = copy(A), copy(Q)
     return A, Q, Ah, Qh, P, PI
 end
+function initialize_transition_matrices(FAC::BlockDiagonalCovariance, p::IWP, dt)
+    A, Q = preconditioned_discretize(p)
+    A = to_factorized_matrix(FAC, A)
+    Q = to_factorized_matrix(FAC, Q)
+    P, PI = initialize_preconditioner(FAC, p, dt)
+    Ah = copy(A)
+    Qh = copy(Q)
+    return A, Q, Ah, Qh, P, PI
+end
 
 function make_transition_matrices!(cache, prior::IWP, dt)
     @unpack A, Q, Ah, Qh, P, PI = cache
diff --git a/src/priors/ltisde.jl b/src/priors/ltisde.jl
index 6f7f14d57..059788ac0 100644
--- a/src/priors/ltisde.jl
+++ b/src/priors/ltisde.jl
@@ -60,44 +60,8 @@ function matrix_fraction_decomposition(
 )
     d = size(drift, 1)
     M = [drift dispersion*dispersion'; zero(drift) -drift']
-    Mexp = exponential!(dt * M)
+    Mexp = exp(dt * M)
     A = Mexp[1:d, 1:d]
     Q = Mexp[1:d, d+1:end] * A'
     return A, Q
 end
-
-# Previous implementation, outdated thanks to FiniteHorizonGramians.jl:
-function _discretize_sqrt_with_quadraturetrick(sde::LTISDE, dt::Real)
-    F, L = drift(sde), dispersion(sde)
-
-    D = size(F, 1)
-    d = size(L, 2)
-    N = D # more robust than Int(D / d)
-    R = similar(F, N * d, D)
-    method = ExpMethodHigham2005()
-    expcache = ExponentialUtilities.alloc_mem(F, method)
-
-    Ah = exponential!(dt * F, method, expcache)
-
-    chol_integrand(τ) = begin
-        E = exponential!((dt - τ) * F', method, expcache)
-        L'E
-    end
-    nodes, weights = gausslegendre(N)
-    b, a = dt, 0
-    @. nodes = (b - a) / 2 * nodes + (a + b) / 2
-    @. weights = (b - a) / 2 * weights
-    @simd ivdep for i in 1:N
-        R[(i-1)*d+1:i*d, 1:D] .= sqrt(weights[i]) .* chol_integrand(nodes[i])
-    end
-
-    M = R'R |> Symmetric
-    chol = cholesky!(M, check=false)
-    Qh_R = if issuccess(chol)
-        chol.U |> Matrix
-    else
-        qr!(R).R |> Matrix
-    end
-
-    return Ah, Qh_R
-end
diff --git a/src/projection.jl b/src/projection.jl
index 29097b253..ba08e65e0 100644
--- a/src/projection.jl
+++ b/src/projection.jl
@@ -30,6 +30,17 @@ function projection(C::IsometricKroneckerCovariance{elType}) where {elType}
     return Proj
 end
 
+function projection(C::BlockDiagonalCovariance{elType}) where {elType}
+    Proj(deriv) = begin
+        e_i = zeros(elType, C.q + 1, 1)
+        if deriv <= C.q
+            e_i[deriv+1] = 1
+        end
+        return BlockDiag([copy(e_i)' for _ in 1:C.d])
+    end
+    return Proj
+end
+
 function solution_space_projection(C::CovarianceStructure, is_secondorder_ode)
     Proj = projection(C)
     if is_secondorder_ode
@@ -47,6 +58,14 @@ function solution_space_projection(C::IsometricKroneckerCovariance, is_secondord
         return Proj(0)
     end
 end
+function solution_space_projection(C::BlockDiagonalCovariance, is_secondorder_ode)
+    Proj = projection(C)
+    if is_secondorder_ode
+        error("Not yet implemented!")
+    else
+        return Proj(0)
+    end
+end
 
 struct KroneckerSecondOrderODESolutionProjector{T,FAC,M,M2} <: AbstractMatrix{T}
     covariance_structure::FAC
diff --git a/test/Project.toml b/test/Project.toml
index f0fff40ee..4705022bd 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Aqua = "4c88cf16-eb10-579e-8560-4a9242c79595"
+BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0"
 DiffEqDevTools = "f3b72e0c-5b89-59e1-b016-84e28bfd966d"
 FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b"
 FiniteDiff = "6a86dc24-6348-571c-b903-95158fe2bd41"
@@ -24,4 +25,3 @@ TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 
 [compat]
 Aqua = "0.8.2"
-DiffEqDevTools = "2.44.1"
diff --git a/test/autodiff.jl b/test/autodiff.jl
index de5dbef83..235478b23 100644
--- a/test/autodiff.jl
+++ b/test/autodiff.jl
@@ -9,48 +9,49 @@ using ForwardDiff
 
 import ODEProblemLibrary: prob_ode_fitzhughnagumo
 
-const _prob = prob_ode_fitzhughnagumo
-const prob = ODEProblem(modelingtoolkitize(_prob), _prob.u0, _prob.tspan, jac=true)
+@testset "solver: $ALG" for ALG in (EK0, EK1, DiagonalEK1)
+    _prob = prob_ode_fitzhughnagumo
+    prob = ODEProblem(modelingtoolkitize(_prob), _prob.u0, _prob.tspan, jac=true)
+    function param_to_loss(p)
+        sol = solve(
+            remake(prob, p=p),
+            ALG(order=3, smooth=false),
+            sensealg=SensitivityADPassThrough(),
+            abstol=1e-3,
+            reltol=1e-2,
+            save_everystep=false,
+            dense=false,
+        )
+        return norm(sol.u[end])  # Dummy loss
+    end
+    function startval_to_loss(u0)
+        sol = solve(
+            remake(prob, u0=u0),
+            ALG(order=3, smooth=false),
+            sensealg=SensitivityADPassThrough(),
+            abstol=1e-3,
+            reltol=1e-2,
+            save_everystep=false,
+            dense=false,
+        )
+        return norm(sol.u[end])  # Dummy loss
+    end
 
-function param_to_loss(p)
-    sol = solve(
-        remake(prob, p=p),
-        EK1(order=3, smooth=false),
-        sensealg=SensitivityADPassThrough(),
-        abstol=1e-3,
-        reltol=1e-2,
-        save_everystep=false,
-        dense=false,
-    )
-    return norm(sol.u[end])  # Dummy loss
-end
-function startval_to_loss(u0)
-    sol = solve(
-        remake(prob, u0=u0),
-        EK1(order=3, smooth=false),
-        sensealg=SensitivityADPassThrough(),
-        abstol=1e-3,
-        reltol=1e-2,
-        save_everystep=false,
-        dense=false,
-    )
-    return norm(sol.u[end])  # Dummy loss
-end
+    dldp = FiniteDiff.finite_difference_gradient(param_to_loss, prob.p)
+    dldu0 = FiniteDiff.finite_difference_gradient(startval_to_loss, prob.u0)
 
-const dldp = FiniteDiff.finite_difference_gradient(param_to_loss, prob.p)
-const dldu0 = FiniteDiff.finite_difference_gradient(startval_to_loss, prob.u0)
+    @testset "ForwardDiff.jl" begin
+        @test ForwardDiff.gradient(param_to_loss, prob.p) ≈ dldp rtol = 1e-3
+        @test ForwardDiff.gradient(startval_to_loss, prob.u0) ≈ dldu0 rtol = 5e-3
+    end
 
-@testset "ForwardDiff.jl" begin
-    @test ForwardDiff.gradient(param_to_loss, prob.p) ≈ dldp rtol = 1e-4
-    @test ForwardDiff.gradient(startval_to_loss, prob.u0) ≈ dldu0 rtol = 5e-4
-end
+    # @testset "ReverseDiff.jl" begin
+    #     @test_broken ReverseDiff.gradient(param_to_loss, prob.p) ≈ dldp
+    #     @test_broken ReverseDiff.gradient(startval_to_loss, prob.u0) ≈ dldu0
+    # end
 
-# @testset "ReverseDiff.jl" begin
-#     @test_broken ReverseDiff.gradient(param_to_loss, prob.p) ≈ dldp
-#     @test_broken ReverseDiff.gradient(startval_to_loss, prob.u0) ≈ dldu0
-# end
-
-# @testset "Zygote.jl" begin
-#     @test_broken Zygote.gradient(param_to_loss, prob.p) ≈ dldp
-#     @test_broken Zygote.gradient(startval_to_loss, prob.u0) ≈ dldu0
-# end
+    # @testset "Zygote.jl" begin
+    #     @test_broken Zygote.gradient(param_to_loss, prob.p) ≈ dldp
+    #     @test_broken Zygote.gradient(startval_to_loss, prob.u0) ≈ dldu0
+    # end
+end
diff --git a/test/complexity.jl b/test/complexity.jl
index 31482dea5..a87bbc471 100644
--- a/test/complexity.jl
+++ b/test/complexity.jl
@@ -7,98 +7,86 @@ using Test, SafeTestsets
 
 @testset "Scaling with ODE dimension" begin
     f(du, u, p, t) = mul!(du, -0.9I, u)
+    jac(J, u, p, t) = @simd ivdep for i in 1:size(J, 1)
+        J[i, i] = -0.9
+    end
     tspan = (0.0, 1.0)
     prob = ODEProblem(f, ones(1), tspan)
 
     NUMRUNS = 20
 
-    @testset "Order 1 + perfect init + no smoothing" begin
-        time_dim(d; Alg) = begin
-            _prob = remake(prob, u0=ones(d))
-            tmin = Inf
-            for _ in 1:NUMRUNS
-                integ = init(_prob,
-                    Alg(
-                        smooth=false,
-                        order=1,
-                        initialization=ClassicSolverInit(),
-                    ),
-                    dense=false, save_everystep=false,
-                    adaptive=false, dt=1e-2,
-                )
-                t = @elapsed solve!(integ)
-                tmin = min(tmin, t)
-            end
-            return tmin
+    _timer(d, alg; kwargs...) = begin
+        _prob = remake(
+            prob,
+            u0=ones(d),
+            f=ODEFunction(f; jac=jac, jac_prototype=Diagonal(ones(d))),
+        )
+        tmin = Inf
+        for _ in 1:NUMRUNS
+            integ = init(_prob, alg; adaptive=false, dt=1e-2, kwargs...)
+            t = @elapsed solve!(integ)
+            tmin = min(tmin, t)
         end
+        return tmin
+    end
 
-        dims_ek0 = 2 .^ (8:15)
-        times_ek0 = [time_dim(d; Alg=EK0) for d in dims_ek0]
-        dims_ek1 = 2 .^ (2:6)
-        times_ek1 = [time_dim(d; Alg=EK1) for d in dims_ek1]
+    @testset "Order 1 + perfect init + no smoothing" begin
+        t(d, Alg) = _timer(
+            d, Alg(smooth=false, order=1, initialization=ClassicSolverInit());
+            dense=false, save_everystep=false,
+        )
 
+        dims_ek0 = 2 .^ (8:15)
+        times_ek0 = [t(d, EK0) for d in dims_ek0]
         lr_ek0 = linregress(log.(dims_ek0), log.(times_ek0))
-        @test_skip slope(lr_ek0)[1] ≈ 1 atol = 0.1
-        @test 0.5 < slope(lr_ek0)[1] < 1.3
+        @test slope(lr_ek0)[1] ≈ 1 atol = 1
 
+        dims_ek1 = 2 .^ (3:6)
+        times_ek1 = [t(d, EK1) for d in dims_ek1]
         lr_ek1 = linregress(log.(dims_ek1), log.(times_ek1))
-        @test_skip slope(lr_ek1)[1] ≈ 2 atol = 0.2
-        # This is what we would actually expect, not sure what's going wrong:
-        @test_skip slope(lr_ek1)[1] ≈ 3 atol = 0.1
+        @test slope(lr_ek1)[1] ≈ 3 atol = 1
+
+        dims_dek1 = 2 .^ (4:10)
+        times_dek1 = [t(d, DiagonalEK1) for d in dims_dek1]
+        lr_dek1 = linregress(log.(dims_dek1), log.(times_dek1))
+        @test slope(lr_dek1)[1] ≈ 1 atol = 1
     end
 
     @testset "Order 3 + Taylor-init + no smoothing" begin
-        time_dim(d; Alg) = begin
-            _prob = remake(prob, u0=ones(d))
-            tmin = Inf
-            for _ in 1:NUMRUNS
-                integ = init(_prob, Alg(smooth=false),
-                    dense=false, save_everystep=false,
-                    adaptive=false, dt=1e-2)
-                t = @elapsed solve!(integ)
-                tmin = min(tmin, t)
-            end
-            return tmin
-        end
+        t(d, Alg) = _timer(d, Alg(smooth=false); dense=false, save_everystep=false)
 
         dims_ek0 = 2 .^ (8:15)
-        times_ek0 = [time_dim(d; Alg=EK0) for d in dims_ek0]
-        dims_ek1 = 2 .^ (2:5)
-        times_ek1 = [time_dim(d; Alg=EK1) for d in dims_ek1]
-
+        times_ek0 = [t(d, EK0) for d in dims_ek0]
         lr_ek0 = linregress(log.(dims_ek0), log.(times_ek0))
-        @test_skip slope(lr_ek0)[1] ≈ 1 atol = 0.1
-        @test 0.5 < slope(lr_ek0)[1] < 1.3
+        @test slope(lr_ek0)[1] ≈ 1 atol = 1
 
+        dims_ek1 = 2 .^ (3:6)
+        times_ek1 = [t(d, EK1) for d in dims_ek1]
         lr_ek1 = linregress(log.(dims_ek1), log.(times_ek1))
-        @test_skip slope(lr_ek1)[1] ≈ 2 atol = 0.5
-        # This is what we would actually expect, not sure what's going wrong:
-        @test_skip slope(lr_ek1)[1] ≈ 3 atol = 0.1
+        @test slope(lr_ek1)[1] ≈ 3 atol = 1
+
+        dims_dek1 = 2 .^ (4:10)
+        times_dek1 = [t(d, DiagonalEK1) for d in dims_dek1]
+        lr_dek1 = linregress(log.(dims_dek1), log.(times_dek1))
+        @test slope(lr_dek1)[1] ≈ 1 atol = 1
     end
 
     @testset "Order 3 with smoothing and everyting" begin
-        time_dim(d; Alg) = begin
-            _prob = remake(prob, u0=ones(d))
-            tmin = Inf
-            for _ in 1:NUMRUNS
-                integ = init(_prob, Alg(), adaptive=false, dt=1e-2)
-                t = @elapsed solve!(integ)
-                tmin = min(tmin, t)
-            end
-            return tmin
-        end
+        t(d, Alg) = _timer(d, Alg())
 
         dims_ek0 = 2 .^ (8:13)
-        times_ek0 = [time_dim(d; Alg=EK0) for d in dims_ek0]
-        dims_ek1 = 2 .^ (1:4)
-        times_ek1 = [time_dim(d; Alg=EK1) for d in dims_ek1]
-
+        times_ek0 = [t(d, EK0) for d in dims_ek0]
         lr_ek0 = linregress(log.(dims_ek0), log.(times_ek0))
-        @test 0.5 < slope(lr_ek0)[1] < 1.3
+        @test slope(lr_ek0)[1] ≈ 1 atol = 1
 
+        dims_ek1 = 2 .^ (3:6)
+        times_ek1 = [t(d, EK1) for d in dims_ek1]
         lr_ek1 = linregress(log.(dims_ek1), log.(times_ek1))
-        @test_skip slope(lr_ek1)[1] ≈ 2 atol = 0.2
-        # This is what we would actually expect, not sure what's going wrong:
-        @test_skip slope(lr_ek1)[1] ≈ 3 atol = 0.1
+        @test slope(lr_ek1)[1] ≈ 3 atol = 1
+
+        dims_dek1 = 2 .^ (4:10)
+        times_dek1 = [t(d, DiagonalEK1) for d in dims_dek1]
+        lr_dek1 = linregress(log.(dims_dek1), log.(times_dek1))
+        @test slope(lr_dek1)[1] ≈ 1 atol = 1
     end
 end
diff --git a/test/core/blockdiagonals.jl b/test/core/blockdiagonals.jl
new file mode 100644
index 000000000..c0969571b
--- /dev/null
+++ b/test/core/blockdiagonals.jl
@@ -0,0 +1,85 @@
+using ProbNumDiffEq
+import ProbNumDiffEq as PNDE
+import ProbNumDiffEq: BlockDiag, _matmul!
+using LinearAlgebra
+using BlockDiagonals
+using Test
+
+d1, d2 = 2, 3
+D = d1 * d2
+@testset "T=$T" for T in (Float64, BigFloat)
+    A = BlockDiag([randn(T, d1, d1) for _ in 1:d2])
+    B = BlockDiag([randn(T, d1, d1) for _ in 1:d2])
+    C = BlockDiag([randn(T, d1, d1) for _ in 1:d2])
+    alpha = rand(T)
+    beta = rand(T)
+
+    AM, BM, CM = @test_nowarn Matrix.((A, B, C))
+
+    @test Matrix(BlockDiagonal(A)) == AM
+    @test Matrix(BlockDiagonal(B)) == BM
+    @test Matrix(BlockDiagonal(C)) == CM
+
+    _A = @test_nowarn copy(A)
+    @test _A isa BlockDiag
+
+    _B = @test_nowarn copy!(_A, B)
+    @test _B === _A
+    @test _B == B
+
+    _A = @test_nowarn similar(A)
+    @test _A isa BlockDiag
+    @test size(_A) == size(A)
+
+    _Z = @test_nowarn zero(A)
+    @test _Z isa BlockDiag
+    @test size(_Z) == size(A)
+    @test all(_Z .== 0)
+
+    function tttm(M) # quick type test and to matrix
+        @test M isa BlockDiag
+        return Matrix(M)
+    end
+
+    @test tttm(mul!(C, A, B)) ≈ mul!(CM, AM, BM)
+    @test tttm(mul!(C, A', B)) ≈ mul!(CM, AM', BM)
+    @test tttm(mul!(C, A, B')) ≈ mul!(CM, AM, BM')
+    @test tttm(_matmul!(C, A, B)) ≈ _matmul!(CM, AM, BM)
+    @test tttm(_matmul!(C, A', B)) ≈ _matmul!(CM, AM', BM)
+    @test tttm(_matmul!(C, A, B')) ≈ _matmul!(CM, AM, BM')
+
+    @test tttm(mul!(C, A, B, alpha, beta)) ≈ mul!(CM, AM, BM, alpha, beta)
+    @test tttm(_matmul!(C, A, B, alpha, beta)) ≈ _matmul!(CM, AM, BM, alpha, beta)
+
+    @test tttm(A * B) ≈ AM * BM
+    @test tttm(A' * B) ≈ AM' * BM
+    @test tttm(A * B') ≈ AM * BM'
+
+    @test tttm(A * alpha) ≈ AM * alpha
+    @test tttm(alpha * A) ≈ alpha * AM
+    @test tttm(A * (alpha * I)) ≈ AM * alpha
+    @test tttm((alpha * I) * A) ≈ alpha * AM
+    @test tttm(rmul!(copy(A), alpha)) ≈ alpha * AM
+    @test tttm(mul!(_A, alpha, A)) ≈ alpha * AM
+    @test tttm(mul!(_A, A, alpha)) ≈ alpha * AM
+    @test tttm(_matmul!(_A, alpha, A)) ≈ alpha * AM
+    @test tttm(_matmul!(_A, A, alpha)) ≈ alpha * AM
+
+    @test tttm((alpha * I(D)) * A) ≈ alpha * AM
+    @test tttm(A * (alpha * I(D))) ≈ AM * alpha
+    @test tttm(mul!(_A, A, alpha * I(D))) ≈ alpha * AM
+    @test tttm(mul!(_A, alpha * I(D), A)) ≈ alpha * AM
+    @test tttm(_matmul!(_A, A, alpha * I(D))) ≈ alpha * AM
+    @test tttm(_matmul!(_A, alpha * I(D), A)) ≈ alpha * AM
+
+    @test_throws ErrorException view(A, 1:2, 1:2)
+
+    tttm(copy!(copy(A), Diagonal(A)))
+
+    @test tttm(A + A) ≈ AM + AM
+    @test tttm(A - A) ≈ AM - AM
+
+    _A = copy(A)
+    @test tttm(PNDE.add!(_A, A)) == AM + AM
+    @test Matrix(_A) == AM + AM
+end
diff --git a/test/core/diffusions.jl b/test/core/diffusions.jl
new file mode 100644
index 000000000..2d7fb59de
--- /dev/null
+++ b/test/core/diffusions.jl
@@ -0,0 +1,59 @@
+using ProbNumDiffEq
+import ProbNumDiffEq as PNDE
+using Test
+using LinearAlgebra
+using FillArrays
+
+d, q = 2, 3
+T = Float64
+
+@testset "$diffusionmodel" for diffusionmodel in (
+    DynamicDiffusion(),
+    DynamicMVDiffusion(),
+    FixedDiffusion(),
+    FixedDiffusion(calibrate=false),
+    FixedMVDiffusion(),
+    FixedMVDiffusion(; initial_diffusion=rand(d)),
+    FixedMVDiffusion(; initial_diffusion=Diagonal(rand(d))),
+    FixedMVDiffusion(; initial_diffusion=Diagonal(rand(d)), calibrate=false),
+)
+
+    # Test the initial diffusion
+    diffusion = PNDE.initial_diffusion(diffusionmodel, d, q, T)
+    @assert size(diffusion) == (d, d)
+    @assert diffusion isa Diagonal
+    if !(diffusionmodel isa FixedMVDiffusion || diffusionmodel isa DynamicMVDiffusion)
+        @assert diffusion isa Diagonal{T,<:Fill}
+    end
+
+    # Test applying the diffusion
+    _, Q = PNDE.discretize(PNDE.IWP{T}(d, q), 0.1)
+    Qmat = PSDMatrix(Matrix(Q.R))
+    _diffusion = rand() * diffusion
+    @testset "$FAC" for FAC in (
+        PNDE.DenseCovariance{T}(d, q),
+        PNDE.BlockDiagonalCovariance{T}(d, q),
+        PNDE.IsometricKroneckerCovariance{T}(d, q),
+    )
+        if diffusion isa Diagonal{T,<:Vector} && FAC isa PNDE.IsometricKroneckerCovariance
+            continue
+        end
+
+        _Q = PNDE.to_factorized_matrix(FAC, Q)
+        Qdiff = @test_nowarn PNDE.apply_diffusion(_Q, _diffusion)
+        Qmatdiff = @test_nowarn PNDE.apply_diffusion(Qmat, _diffusion)
+        @test Qdiff == Qmatdiff
+
+        Qdiff = @test_nowarn PNDE.apply_diffusion!(copy(_Q), _diffusion)
+        @test Qdiff == Qmatdiff
+
+        Qdiff = @test_nowarn PNDE.apply_diffusion!(copy(_Q), _Q, _diffusion)
+        @test Qdiff == Qmatdiff
+    end
+
+    @testset "Calibration" begin
+        # MLE
+        # At their core, they all just compute z' * inv(S) * z
+        # and then do something with the result
+    end
+end
diff --git a/test/core/fast_linalg.jl b/test/core/fast_linalg.jl
new file mode 100644
index 000000000..2363f553f
--- /dev/null
+++ b/test/core/fast_linalg.jl
@@ -0,0 +1,52 @@
+using ProbNumDiffEq
+import ProbNumDiffEq: _matmul!
+using LinearAlgebra
+using Test
+
+@testset "T=$T" for T in (Float64, BigFloat)
+    A = rand(T, 2, 3)
+    B = rand(T, 3, 4)
+    C = rand(T, 2, 4)
+    alpha = rand(T)
+    beta = rand(T)
+
+    @test _matmul!(C, A, B) == mul!(C, A, B)
+    @test _matmul!(C, A, B, alpha, beta) == mul!(C, A, B, alpha, beta)
+
+    _B = copy(B)
+    @test _matmul!(_B, alpha, B) == mul!(_B, alpha, B)
+    @test _matmul!(_B, B, beta) == mul!(_B, B, beta)
+
+    # Diagonals
+    D = Diagonal(rand(T, size(B, 1)))
+    @test _matmul!(_B, D, B) == mul!(_B, D, B)
+    @test _matmul!(_B, D, B, alpha, beta) == mul!(_B, D, B, alpha, beta)
+    D = Diagonal(rand(T, size(B, 2)))
+    @test _matmul!(_B, B, D) == mul!(_B, B, D)
+    @test _matmul!(_B, B, D, alpha, beta) == mul!(_B, B, D, alpha, beta)
+    CD, D1, D2 = rand(T, 3, 3), Diagonal(rand(T, 3)), Diagonal(rand(T, 3))
+    @test _matmul!(CD, D1, D2) == _matmul!(CD, D1, D2)
+
+    # Triangulars
+    ASQ, BSQ, CSQ = rand(T, 2, 2), rand(T, 2, 2), rand(T, 2, 2)
+    ALT, AUT = LowerTriangular(ASQ), UpperTriangular(ASQ)
+    BLT, BUT = LowerTriangular(BSQ), UpperTriangular(BSQ)
+    @test _matmul!(CSQ, ALT, BSQ) == mul!(CSQ, ALT, BSQ)
+    @test _matmul!(CSQ, AUT, BSQ) == mul!(CSQ, AUT, BSQ)
+    @test _matmul!(CSQ, ASQ, BLT) == mul!(CSQ, ASQ, BLT)
+    @test _matmul!(CSQ, ASQ, BUT) == mul!(CSQ, ASQ, BUT)
+    @test _matmul!(CSQ, ALT, BUT) == mul!(CSQ, ALT, BUT)
+    @test _matmul!(CSQ, AUT, BLT) == mul!(CSQ, AUT, BLT)
+    @test _matmul!(CSQ, ALT, BLT) == mul!(CSQ, ALT, BLT)
+    @test _matmul!(CSQ, AUT, BUT) == mul!(CSQ, AUT, BUT)
+
+    # Adjoints
+    AT = Matrix(A')'
+    @test _matmul!(C, AT, B) == mul!(C, AT, B)
+    BT = Matrix(B')'
+    @test _matmul!(C, A, BT) == mul!(C, A, BT)
+
+    # Vectors
+    CV, BV = rand(T, 2), rand(T, 3)
+    @test _matmul!(CV, A, BV) == mul!(CV, A, BV)
+end
diff --git a/test/core/filtering.jl b/test/core/filtering.jl
index d570e2eed..9da066b22 100644
--- a/test/core/filtering.jl
+++ b/test/core/filtering.jl
@@ -5,53 +5,53 @@ Check the correctness of the filtering implementations vs. basic readable math c
 using Test
 using ProbNumDiffEq
 using LinearAlgebra
-import ProbNumDiffEq: IsometricKroneckerProduct
+import ProbNumDiffEq: IsometricKroneckerProduct, BlockDiag
 import ProbNumDiffEq as PNDE
+using BlockDiagonals
+using FillArrays
+import ProbNumDiffEq.GaussianDistributions: logpdf
 
 @testset "PREDICT" begin
     # Setup
-    d = 5
-    m = rand(d)
-    P_R = Matrix(UpperTriangular(rand(d, d)))
-    P = P_R'P_R
+    d = 2
+    q = 2
+    D = d * (q + 1)
+    m = rand(D)
 
-    A = rand(d, d)
-    Q_R = Matrix(UpperTriangular(rand(d, d)))
-    Q = Q_R'Q_R
+    _P_R = IsometricKroneckerProduct(d, Matrix(UpperTriangular(rand(q + 1, q + 1))))
+    _P = _P_R'_P_R
+    PM = Matrix(_P)
 
-    # PREDICT
-    m_p = A * m
-    P_p = A * P * A' + Q
+    _A = IsometricKroneckerProduct(d, rand(q + 1, q + 1))
+    AM = Matrix(_A)
 
-    x_curr = Gaussian(m, P)
-    x_out = copy(x_curr)
+    _Q_R = IsometricKroneckerProduct(d, Matrix(UpperTriangular(rand(q + 1, q + 1))))
+    _Q = _Q_R'_Q_R
+    QM = Matrix(_Q)
 
-    C_DxD = zeros(d, d)
-    C_2DxD = zeros(2d, d)
-    C_3DxD = zeros(3d, d)
+    # PREDICT
+    m_p = AM * m
+    P_p = AM * PM * AM' + QM
 
-    _fstr(F) = F ? "Kronecker" : "None"
-    @testset "Factorization: $(_fstr(KRONECKER))" for KRONECKER in (false, true)
-        if KRONECKER
-            K = 2
-            m = kron(ones(K), m)
-            P_R = IsometricKroneckerProduct(K, P_R)
-            P = P_R'P_R
+    @testset "Factorization: $_FAC" for _FAC in (
+        PNDE.DenseCovariance,
+        PNDE.BlockDiagonalCovariance,
+        PNDE.IsometricKroneckerCovariance,
+    )
+        FAC = _FAC{Float64}(d, q)
 
-            A = IsometricKroneckerProduct(K, A)
-            Q_R = IsometricKroneckerProduct(K, Q_R)
-            Q = Q_R'Q_R
+        P_R = PNDE.to_factorized_matrix(FAC, _P_R)
+        P = P_R'P_R
+        A = PNDE.to_factorized_matrix(FAC, _A)
+        Q_R = PNDE.to_factorized_matrix(FAC, _Q_R)
+        Q = Q_R'Q_R
 
-            m_p = A * m
-            P_p = A * P * A' + Q
+        x_curr = Gaussian(m, P)
+        x_out = copy(x_curr)
 
-            x_curr = Gaussian(m, P)
-            x_out = copy(x_curr)
-
-            C_DxD = IsometricKroneckerProduct(K, C_DxD)
-            C_2DxD = IsometricKroneckerProduct(K, C_2DxD)
-            C_3DxD = IsometricKroneckerProduct(K, C_3DxD)
-        end
+        C_DxD = PNDE.factorized_zeros(FAC, D, D)
+        C_2DxD = PNDE.factorized_zeros(FAC, 2D, D)
+        C_3DxD = PNDE.factorized_zeros(FAC, 3D, D)
 
         @testset "predict" begin
             x_out = ProbNumDiffEq.predict(x_curr, A, Q)
@@ -68,6 +68,28 @@ import ProbNumDiffEq as PNDE
             @test P_p ≈ Matrix(x_out.Σ)
         end
 
+        @testset "predict! with PSDMatrix and diffusion" begin
+            for diffusion in (rand(), rand() * Eye(d), rand() * I(d), Diagonal(rand(d)))
+                if _FAC == PNDE.IsometricKroneckerCovariance &&
+                   !(
+                    diffusion isa Number ||
+                    diffusion isa Diagonal{<:Number,<:FillArrays.Fill}
+                )
+                    continue
+                end
+                _diffusions = diffusion isa Number ? diffusion * Ones(d) : diffusion.diag
+
+                QM_diff = Matrix(BlockDiagonal([σ² * _Q.B for σ² in _diffusions]))
+                P_p_diff = AM * PM * AM' + QM_diff
+
+                x_curr = Gaussian(m, PSDMatrix(P_R))
+                x_out = copy(x_curr)
+                Q_SR = PSDMatrix(Q_R)
+                ProbNumDiffEq.predict!(x_out, x_curr, A, Q_SR, C_DxD, C_2DxD, diffusion)
+                @test P_p_diff ≈ Matrix(x_out.Σ)
+            end
+        end
+
         @testset "predict! with zero diffusion" begin
             x_curr = Gaussian(m, PSDMatrix(P_R))
             x_out = copy(x_curr)
@@ -81,8 +103,10 @@ import ProbNumDiffEq as PNDE
             x_curr = Gaussian(m, PSDMatrix(P_R))
             x_out = copy(x_curr)
             # marginalize! needs tall square-roots:
-            Q_SR = if KRONECKER
+            Q_SR = if Q_R isa IsometricKroneckerProduct
                 PSDMatrix(IsometricKroneckerProduct(Q_R.ldim, [Q_R.B; zero(Q_R.B)]))
+            elseif Q_R isa BlockDiag
+                PSDMatrix(BlockDiag([[B; zero(B)] for B in Q_R.blocks]))
             else
                 PSDMatrix([Q_R; zero(Q_R)])
             end
@@ -98,60 +122,67 @@ end
 @testset "UPDATE" begin
     # Setup
     d = 5
+    o = 1
+
     m_p = rand(d)
-    P_p_R = Matrix(UpperTriangular(rand(d, d)))
-    P_p = P_p_R'P_p_R
+    _P_p_R = IsometricKroneckerProduct(o, Matrix(UpperTriangular(rand(d, d))))
+    _P_p = _P_p_R'_P_p_R
+    P_p_M = Matrix(_P_p)
 
     # Measure
-    o = 1
     _HB = rand(1, d)
-    H = kron(I(o), _HB)
+    _H = IsometricKroneckerProduct(o, _HB)
+    HM = Matrix(_H)
     R = zeros(o, o)
 
     z_data = zeros(o)
-    z = H * m_p
-    SR = P_p_R * H'
-    S = Symmetric(SR'SR)
-
-    # UPDATE
-    S_inv = inv(S)
-    K = P_p * H' * S_inv
-    m = m_p + K * (z_data .- z)
-    P = P_p - K * S * K'
-
-    x_pred = Gaussian(m_p, P_p)
-    x_out = copy(x_pred)
-    measurement = Gaussian(z, S)
+    z = _H * m_p
+    _SR = _P_p_R * _H'
+    _S = _SR'_SR
+    SM = Matrix(_S)
 
-    C_dxd = zeros(o, o)
-    C_d = zeros(o)
-    C_Dxd = zeros(d, o)
-    C_DxD = zeros(d, d)
-    C_2DxD = zeros(2d, d)
-    C_3DxD = zeros(3d, d)
+    LL = logpdf(Gaussian(z, SM), z_data)
 
-    _fstr(F) = F ? "Kronecker" : "None"
-    @testset "Factorization: $(_fstr(KRONECKER))" for KRONECKER in (false, true)
-        if KRONECKER
-            P_p_R = IsometricKroneckerProduct(1, P_p_R)
-            P_p = P_p_R'P_p_R
-
-            H = IsometricKroneckerProduct(1, _HB)
-            R = zeros(o, o)
-
-            SR = IsometricKroneckerProduct(1, SR)
-            S = SR'SR
-
-            x_pred = Gaussian(m_p, P_p)
-            x_out = copy(x_pred)
-            measurement = Gaussian(z, S)
-
-            C_dxd = IsometricKroneckerProduct(1, C_dxd)
-            C_Dxd = IsometricKroneckerProduct(1, C_Dxd)
-            C_DxD = IsometricKroneckerProduct(1, C_DxD)
-            C_2DxD = IsometricKroneckerProduct(1, C_2DxD)
-            C_3DxD = IsometricKroneckerProduct(1, C_3DxD)
-        end
+    # UPDATE
+    KM = P_p_M * HM' / SM
+    m = m_p + KM * (z_data .- z)
+    P = P_p_M - KM * SM * KM'
+
+    _R_R = rand(o, o)
+    _R = _R_R'_R_R
+    _SR_noisy = qr([_P_p_R * _H'; _R_R]).R |> Matrix
+    _S_noisy = _SR_noisy'_SR_noisy
+    SM_noisy = Matrix(_S_noisy)
+
+    KM_noisy = P_p_M * HM' / SM_noisy
+    m_noisy = m_p + KM_noisy * (z_data .- z)
+    P_noisy = P_p_M - KM_noisy * SM_noisy * KM_noisy'
+
+    @testset "Factorization: $_FAC" for _FAC in (
+        PNDE.DenseCovariance,
+        PNDE.BlockDiagonalCovariance,
+        PNDE.IsometricKroneckerCovariance,
+    )
+        FAC = _FAC{Float64}(o, d)
+
+        C_dxd = PNDE.factorized_zeros(FAC, o, o)
+        C_d = zeros(o)
+        C_Dxd = PNDE.factorized_zeros(FAC, d, o)
+        C_DxD = PNDE.factorized_zeros(FAC, d, d)
+        C_2DxD = PNDE.factorized_zeros(FAC, 2d, d)
+        C_3DxD = PNDE.factorized_zeros(FAC, 3d, d)
+
+        P_p_R = PNDE.to_factorized_matrix(FAC, _P_p_R)
+        P_p = P_p_R'P_p_R
+
+        H = PNDE.to_factorized_matrix(FAC, _H)
+
+        SR = PNDE.to_factorized_matrix(FAC, _SR)
+        S = SR'SR
+
+        x_pred = Gaussian(m_p, P_p)
+        x_out = copy(x_pred)
+        measurement = Gaussian(z, S)
 
         @testset "update" begin
             x_out = ProbNumDiffEq.update(x_pred, measurement, H)
@@ -170,7 +201,7 @@ end
                 z_cache = C_d
                 x_pred = Gaussian(x_pred.μ, PSDMatrix(P_p_R))
                 x_out = copy(x_pred)
-                ProbNumDiffEq.update!(
+                _, ll = ProbNumDiffEq.update!(
                     x_out,
                     x_pred,
                     msmnt,
@@ -183,6 +214,7 @@ end
                 )
                 @test m ≈ x_out.μ
                 @test P ≈ Matrix(x_out.Σ)
+                @test ll ≈ LL
             end
             @testset "Zero predicted covariance" begin
                 K_cache = copy(C_Dxd)
@@ -230,6 +262,8 @@ end
     SR = qr([P_p_R * H'; R_R]).R |> Matrix
     S = Symmetric(SR'SR)
 
+    LL = logpdf(Gaussian(z, S), z_data)
+
     # UPDATE
     S_inv = inv(S)
     K = P_p * H' * S_inv
@@ -288,7 +322,7 @@ end
                 z_cache = C_d
                 x_pred = Gaussian(x_pred.μ, PSDMatrix(P_p_R))
                 x_out = copy(x_pred)
-                ProbNumDiffEq.update!(
+                _, ll = ProbNumDiffEq.update!(
                     x_out,
                     x_pred,
                     msmnt,
@@ -302,6 +336,7 @@ end
                 )
                 @test m ≈ x_out.μ
                 @test P ≈ Matrix(x_out.Σ)
+                @test ll ≈ LL
             end
             @testset "Zero predicted covariance" begin
                 K_cache = copy(C_Dxd)
@@ -334,57 +369,63 @@ end
 @testset "SMOOTH" begin
     # Setup
     d = 5
-    m, m_s = rand(d), rand(d)
-    P_R, P_s_R = Matrix(UpperTriangular(rand(d, d))), Matrix(UpperTriangular(rand(d, d)))
-    P, P_s = P_R'P_R, P_s_R'P_s_R
+    q = 2
+    D = d * (q + 1)
+
+    m, m_s = rand(D), rand(D)
+    _P_R = IsometricKroneckerProduct(d, Matrix(UpperTriangular(rand(q + 1, q + 1))))
+    _P_s_R = IsometricKroneckerProduct(d, Matrix(UpperTriangular(rand(q + 1, q + 1))))
+    _P, _P_s = _P_R'_P_R, _P_s_R'_P_s_R
+    PM, P_sM = Matrix(_P), Matrix(_P_s)
 
-    A = rand(d, d)
-    Q_R = Matrix(UpperTriangular(rand(d, d))) + I
-    Q = Q_R'Q_R
-    Q_SR = PSDMatrix(Q_R)
+    _A = IsometricKroneckerProduct(d, rand(q + 1, q + 1))
+    AM = Matrix(_A)
+    _Q_R = IsometricKroneckerProduct(d, Matrix(UpperTriangular(rand(q + 1, q + 1)) + I))
+    _Q = _Q_R'_Q_R
+    _Q_SR = PSDMatrix(_Q_R)
 
     # PREDICT first
-    m_p = A * m
-    P_p_R = qr([P_R * A'; Q_R]).R |> Matrix
-    P_p = A * P * A' + Q
-    @assert P_p ≈ P_p_R'P_p_R
+    m_p = AM * m
+    _P_p_R = IsometricKroneckerProduct(d, qr([_P_R.B * _A.B'; _Q_R.B]).R |> Matrix)
+    _P_p = _A * _P * _A' + _Q
+    @assert _P_p ≈ _P_p_R'_P_p_R
+    P_pM = Matrix(_P_p)
 
     # SMOOTH
-    G = P * A' * inv(P_p)
+    G = _P * _A' * inv(_P_p) |> Matrix
     m_smoothed = m + G * (m_s - m_p)
-    P_smoothed = P + G * (P_s - P_p) * G'
+    P_smoothed = PM + G * (P_sM - P_pM) * G'
 
-    x_curr = Gaussian(m, P)
-    x_next = Gaussian(m_s, P_s)
     x_smoothed = Gaussian(m_smoothed, P_smoothed)
 
-    _fstr(F) = F ? "Kronecker" : "None"
-    @testset "Factorization: $(_fstr(KRONECKER))" for KRONECKER in (false, true)
-        K = 2
-        if KRONECKER
-            P_R = IsometricKroneckerProduct(K, P_R)
-            P = P_R'P_R
+    @testset "Factorization: $_FAC" for _FAC in (
+        PNDE.DenseCovariance,
+        PNDE.BlockDiagonalCovariance,
+        PNDE.IsometricKroneckerCovariance,
+    )
+        FAC = _FAC{Float64}(d, q)
 
-            P_s_R = IsometricKroneckerProduct(K, P_s_R)
-            P_s = P_s_R'P_s_R
-
-            P_p_R = IsometricKroneckerProduct(K, P_p_R)
-            P_p = P_p_R'P_p_R
+        P_R = PNDE.to_factorized_matrix(FAC, _P_R)
+        P = P_R'P_R
+        P_s_R = PNDE.to_factorized_matrix(FAC, _P_s_R)
+        P_s = P_s_R'P_s_R
+        P_p_R = PNDE.to_factorized_matrix(FAC, _P_p_R)
+        P_p = P_p_R'P_p_R
 
-            m, m_s, m_p = kron(ones(K), m), kron(ones(K), m_s), kron(ones(K), m_p)
+        x_curr = Gaussian(m, P)
+        x_next = Gaussian(m_s, P_s)
 
-            A = IsometricKroneckerProduct(K, A)
-            Q_R = IsometricKroneckerProduct(K, Q_R)
-            Q = Q_R'Q_R
-            Q_SR = PSDMatrix(Q_R)
+        A = PNDE.to_factorized_matrix(FAC, _A)
+        Q_R = PNDE.to_factorized_matrix(FAC, _Q_R)
+        Q = Q_R'Q_R
+        Q_SR = PSDMatrix(Q_R)
 
-            x_curr = Gaussian(m, P)
-            x_next = Gaussian(m_s, P_s)
+        x_curr = Gaussian(m, P)
+        x_next = Gaussian(m_s, P_s)
 
-            m_smoothed = kron(ones(K), m_smoothed)
-            P_smoothed = IsometricKroneckerProduct(K, P_smoothed)
-            x_smoothed = Gaussian(m_smoothed, P_smoothed)
-        end
+        C_DxD = PNDE.factorized_zeros(FAC, D, D)
+        C_2DxD = PNDE.factorized_zeros(FAC, 2D, D)
+        C_3DxD = PNDE.factorized_zeros(FAC, 3D, D)
 
         @testset "smooth" begin
             x_out, _ = ProbNumDiffEq.smooth(x_curr, x_next, A, Q)
@@ -401,22 +442,12 @@ end
         @testset "smooth via backward kernels" begin
             K_forward = ProbNumDiffEq.AffineNormalKernel(copy(A), copy(Q_SR))
             K_backward = ProbNumDiffEq.AffineNormalKernel(
-                copy(A), copy(m_p),
-                if KRONECKER
-                    PSDMatrix(IsometricKroneckerProduct(K, zeros(2d, d)))
-                else
-                    PSDMatrix(zeros(2d, d))
-                end)
+                copy(A), copy(m_p), PSDMatrix(copy(C_2DxD)))
 
             x_curr = Gaussian(m, PSDMatrix(P_R)) |> copy
             x_next_pred = Gaussian(m_p, PSDMatrix(P_p_R)) |> copy
             x_next_smoothed = Gaussian(m_s, PSDMatrix(P_s_R)) |> copy
 
-            C_DxD = if KRONECKER
-                IsometricKroneckerProduct(K, zeros(d, d))
-            else
-                zeros(d, d)
-            end
             ProbNumDiffEq.compute_backward_kernel!(
                 K_backward, x_next_pred, x_curr, K_forward; C_DxD)
 
@@ -427,11 +458,6 @@ end
             @test K_backward.b ≈ b
             @test Matrix(K_backward.C) ≈ Λ
 
-            C_3DxD = if KRONECKER
-                IsometricKroneckerProduct(K, zeros(3d, d))
-            else
-                zeros(3d, d)
-            end
             ProbNumDiffEq.marginalize_mean!(x_curr.μ, x_next_smoothed.μ, K_backward)
             ProbNumDiffEq.marginalize_cov!(
                 x_curr.Σ,
@@ -454,6 +480,32 @@ end
                 @test K2.b == K_backward.b
                 @test K2.C == K_backward.C
             end
+
+            @testset "smooth via backward kernels with diffusion $diffusion" for diffusion in
+                                                                                 (
+                rand(), rand() * Eye(d), rand() * I(d), Diagonal(rand(d)),
+            )
+                if _FAC == PNDE.IsometricKroneckerCovariance &&
+                   !(
+                    diffusion isa Number ||
+                    diffusion isa Diagonal{<:Number,<:FillArrays.Fill}
+                )
+                    continue
+                end
+                _diffusions =
+                    diffusion isa Number ? diffusion * Ones(d) : diffusion.diag
+                QM_diff = Matrix(BlockDiagonal([σ² * _Q.B for σ² in _diffusions]))
+
+                ProbNumDiffEq.compute_backward_kernel!(
+                    K_backward, x_next_pred, x_curr, K_forward; C_DxD, diffusion)
+
+                G = Matrix(x_curr.Σ) * Matrix(A)' * inv(Matrix(x_next_pred.Σ))
+                b = x_curr.μ - G * x_next_pred.μ
+                Λ = (I - G * AM) * Matrix(x_curr.Σ) * (I - G * AM)' + G * QM_diff * G'
+                @test K_backward.A ≈ G
+                @test K_backward.b ≈ b
+                @test Matrix(K_backward.C) ≈ Λ
+            end
         end
     end
 end
diff --git a/test/core/kronecker.jl b/test/core/kronecker.jl
index 982654da7..5e8ad085c 100644
--- a/test/core/kronecker.jl
+++ b/test/core/kronecker.jl
@@ -14,11 +14,27 @@ q = 2
     K2 = PNDE.IsometricKroneckerProduct(d, R2)
     M2 = Matrix(K2)
 
+    # Base
+    K3 = PNDE.IsometricKroneckerProduct(d, copy(R2))
+    M3 = Matrix(K3)
+    @test similar(K1) isa PNDE.IsometricKroneckerProduct
+    @test copy(K1) isa PNDE.IsometricKroneckerProduct
+    @test copy!(K3, K1) isa PNDE.IsometricKroneckerProduct
+    @test K3 == K1
+    @test size(K1) == size(M1)
+
+    function tttm(M) # quick type test and to matrix
+        @test M isa PNDE.IsometricKroneckerProduct
+        return Matrix(M)
+    end
+
     # Matrix-Matrix Operations
-    @test K1 * K2 ≈ M1 * M2
-    @test K1 * K2 isa PNDE.IsometricKroneckerProduct
-    @test K1 + K2 ≈ M1 + M2
-    @test K1 + K2 isa PNDE.IsometricKroneckerProduct
+    @test tttm(K1 * K2) ≈ M1 * M2
+    @test tttm(K1 + K2) ≈ M1 + M2
+
+    _K1 = copy(K1)
+    @test tttm(PNDE.add!(_K1, K2)) ≈ M1 + M2
+    @test _K1 ≈ M1 + M2
 
     # DimensionMismatch
     X = PNDE.IsometricKroneckerProduct(d, rand(T, 1, 1))
@@ -31,38 +47,21 @@ q = 2
     R4 = rand(T, q + 1)
     K4 = PNDE.IsometricKroneckerProduct(d, R4)
     M4 = Matrix(K4)
-    @test K1 * K4 ≈ M1 * M4
+    @test tttm(K1 * K4) ≈ M1 * M4
     @test_throws DimensionMismatch K1 + K4
 
     # UniformScaling
-    @test I + K1 ≈ I + M1
-    @test I + K1 isa PNDE.IsometricKroneckerProduct
-    @test K1 + I ≈ M1 + I
-    @test K1 + I isa PNDE.IsometricKroneckerProduct
-    @test I - K1 ≈ I - M1
-    @test I - K1 isa PNDE.IsometricKroneckerProduct
-    @test K1 - I ≈ M1 - I
-    @test K1 - I isa PNDE.IsometricKroneckerProduct
+    @test tttm(I + K1) ≈ I + M1
+    @test tttm(K1 + I) ≈ M1 + I
+    @test tttm(I - K1) ≈ I - M1
+    @test tttm(K1 - I) ≈ M1 - I
 
     # Other LinearAlgebra
-    @test K1' ≈ M1'
-    @test K1' isa PNDE.IsometricKroneckerProduct
-    @test inv(K1) ≈ inv(M1)
-    @test inv(K1) isa PNDE.IsometricKroneckerProduct
+    @test tttm(K1') ≈ M1'
+    @test tttm(inv(K1)) ≈ inv(M1)
     @test det(K1) ≈ det(M1)
-    @test K1 / K2 ≈ M1 / M2
-    @test K1 / K2 isa PNDE.IsometricKroneckerProduct
-    @test K1 \ K2 ≈ M1 \ M2
-    @test K1 \ K2 isa PNDE.IsometricKroneckerProduct
-
-    # Base
-    K3 = PNDE.IsometricKroneckerProduct(d, copy(R2))
-    M3 = Matrix(K3)
-    @test similar(K1) isa PNDE.IsometricKroneckerProduct
-    @test copy(K1) isa PNDE.IsometricKroneckerProduct
-    @test copy!(K3, K1) isa PNDE.IsometricKroneckerProduct
-    @test K3 == K1
-    @test size(K1) == size(M1)
+    @test tttm(K1 / K2) ≈ M1 / M2
+    @test tttm(K1 \ K2) ≈ M1 \ M2
 
     # Base
     @test one(K1) isa PNDE.IsometricKroneckerProduct
@@ -72,10 +71,13 @@ q = 2
 
     # Matrix-Scalar
     α = 2.0
-    @test α * K1 ≈ α * M1
-    @test α * K1 isa PNDE.IsometricKroneckerProduct
-    @test K1 * α ≈ α * M1
-    @test K1 * α isa PNDE.IsometricKroneckerProduct
+    @test tttm(α * K1) ≈ α * M1
+    @test tttm(K1 * α) ≈ α * M1
+    _K1 = copy(K1)
+    @test mul!(_K1, α, K1) == α * K1
+    @test mul!(_K1, K1, α) == α * K1
+    @test PNDE._matmul!(_K1, K1, α) == α * K1
+    @test PNDE._matmul!(_K1, α, K1) == α * K1
 
     # In-place Matrix-Matrix Multiplication
     β = -0.5
diff --git a/test/core/preconditioning.jl b/test/core/preconditioning.jl
index 41502fa6b..27c02cd62 100644
--- a/test/core/preconditioning.jl
+++ b/test/core/preconditioning.jl
@@ -2,10 +2,6 @@ using Test
 using LinearAlgebra
 using ProbNumDiffEq
 import ProbNumDiffEq as PNDE
-import ODEProblemLibrary: prob_ode_lotkavolterra
-
-prob = prob_ode_lotkavolterra
-prob = remake(prob, tspan=(0.0, 10.0))
 
 @testset "Condition numbers of A,Q" begin
     h = 0.1 * rand()
@@ -34,3 +30,22 @@ prob = remake(prob, tspan=(0.0, 10.0))
     @test cond(Matrix(Qh)) > cond(Matrix(Qh_p))
     @test cond(Matrix(Qh)) > cond(Matrix(Qh_p))^2
 end
+
+@testset "Covariance Factorizations all doing the correct thing" begin
+    h = rand()
+    d, q = 2, 3
+
+    function make_preconditioners(FAC)
+        P, PI = PNDE.init_preconditioner(FAC{Float64}(d, q))
+        PNDE.make_preconditioner!(P, h, d, q)
+        PNDE.make_preconditioner_inv!(PI, h, d, q)
+        return P, PI
+    end
+
+    PK, PIK = make_preconditioners(PNDE.IsometricKroneckerCovariance)
+    PD, PID = make_preconditioners(PNDE.DenseCovariance)
+    PB, PIB = make_preconditioners(PNDE.BlockDiagonalCovariance)
+
+    @test PK == PD == PB
+    @test PIK == PID == PIB
+end
diff --git a/test/core/priors.jl b/test/core/priors.jl
index dac7769f2..688374876 100644
--- a/test/core/priors.jl
+++ b/test/core/priors.jl
@@ -7,9 +7,9 @@ using FiniteHorizonGramians
 using Statistics
 using Plots
 using SimpleUnPack
+using FillArrays
 
 h = 0.1
-σ = 0.1
 
 @testset "General prior API" begin
     for prior in (
@@ -31,11 +31,6 @@ h = 0.1
         @test A1 ≈ A3
         @test Matrix(Q1) ≈ Q3
 
-        A4, Q4R = PNDE._discretize_sqrt_with_quadraturetrick(
-            PNDE.LTISDE(Matrix(sde.F), Matrix(sde.L)), h)
-        @test A1 ≈ A4
-        @test Q1.R ≈ Q4R
-
         ts = 0:0.1:1
         marginals = @test_nowarn PNDE.marginalize(prior, ts)
         @test length(marginals) == length(ts)
@@ -54,6 +49,8 @@ end
 @testset "Test IWP (d=2,q=2)" begin
     d, q = 2, 2
 
+    σ = 0.1
+
     prior = PNDE.IWP(dim=d, num_derivatives=q)
 
     # true sde parameters
@@ -122,41 +119,54 @@ end
 
     @testset "Test vanilla (ie. non-preconditioned)" begin
         Ah, Qh = PNDE.discretize(prior, h)
-        Qh = PNDE.apply_diffusion(Qh, σ^2)
-
         @test AH_22_IBM ≈ Ah
-        @test QH_22_IBM ≈ Matrix(Qh)
+
+        for Γ in (σ^2, σ^2 * Eye(d))
+            @test QH_22_IBM ≈ Matrix(PNDE.apply_diffusion(Qh, Γ))
+        end
     end
 
     @testset "Test with preconditioning" begin
         A, Q = PNDE.preconditioned_discretize(prior)
-        Qh = PNDE.apply_diffusion(Q, σ^2)
-
         @test AH_22_PRE ≈ Matrix(A)
-        @test QH_22_PRE ≈ Matrix(Qh)
+
+        for Γ in (σ^2, σ^2 * Eye(d))
+            @test QH_22_PRE ≈ Matrix(PNDE.apply_diffusion(Q, Γ))
+        end
     end
 
     @testset "Test `make_transition_matrices!`" begin
-        A, Q, Ah, Qh, P, PI = PNDE.initialize_transition_matrices(
-            PNDE.DenseCovariance{Float64}(d, q), prior, h)
-
-        @test AH_22_PRE ≈ A
-        @test QH_22_PRE ≈ Matrix(PNDE.apply_diffusion(Q, σ^2))
-
-        cache = (
-            d=d,
-            q=q,
-            A=A,
-            Q=Q,
-            P=P,
-            PI=PI,
-            Ah=Ah,
-            Qh=Qh,
-        )
-
-        make_transition_matrices!(cache, prior, h)
-        @test AH_22_IBM ≈ cache.Ah
-        @test QH_22_IBM ≈ Matrix(PNDE.apply_diffusion(cache.Qh, σ^2))
+        for FAC in (PNDE.IsometricKroneckerCovariance, PNDE.BlockDiagonalCovariance)
+            A, Q, Ah, Qh, P, PI = PNDE.initialize_transition_matrices(
+                PNDE.BlockDiagonalCovariance{Float64}(d, q), prior, h)
+
+            @test AH_22_PRE ≈ A
+
+            for Γ in (σ^2, σ^2 * Eye(d), σ^2 * I(d))
+                @test QH_22_PRE ≈ Matrix(PNDE.apply_diffusion(Q, Γ))
+            end
+
+            cache = (
+                d=d,
+                q=q,
+                A=A,
+                Q=Q,
+                P=P,
+                PI=PI,
+                Ah=Ah,
+                Qh=Qh,
+            )
+
+            make_transition_matrices!(cache, prior, h)
+            @test AH_22_IBM ≈ cache.Ah
+
+            for Γ in (σ^2, σ^2 * Eye(d), σ^2 * I(d))
+                @test QH_22_IBM ≈ Matrix(PNDE.apply_diffusion(cache.Qh, Γ))
+            end
+            if FAC != PNDE.IsometricKroneckerCovariance
+                @test QH_22_IBM ≈ Matrix(PNDE.apply_diffusion(cache.Qh, σ^2 * I(d)))
+            end
+        end
     end
 end
 
diff --git a/test/correctness.jl b/test/correctness.jl
index df76f16de..d43ab1175 100644
--- a/test/correctness.jl
+++ b/test/correctness.jl
@@ -30,6 +30,10 @@ CONSTANT_ALGS = (
     EK1(order=3, smooth=false, diffusionmodel=FixedDiffusion()) => 1e-8,
     EK1(order=3, smooth=false, initialization=ClassicSolverInit()) => 1e-7,
     EK1(order=3, smooth=false, initialization=SimpleInit()) => 1e-5,
+    DiagonalEK1(order=3, smooth=false) => 1e-7,
+    DiagonalEK1(order=3, smooth=false, diffusionmodel=FixedDiffusion()) => 1e-7,
+    DiagonalEK1(order=3, smooth=false, diffusionmodel=DynamicDiffusion()) => 1e-7,
+    DiagonalEK1(order=3, smooth=false, initialization=ClassicSolverInit()) => 1e-7,
     # smoothing
     EK0(order=3, smooth=true) => 1e-8,
     EK0(order=3, smooth=true, diffusionmodel=FixedDiffusion()) => 2e-8,
@@ -37,6 +41,10 @@ CONSTANT_ALGS = (
     EK0(order=3, smooth=true, diffusionmodel=DynamicMVDiffusion()) => 1e-8,
     EK1(order=3, smooth=true) => 1e-8,
     EK1(order=3, smooth=true, diffusionmodel=FixedDiffusion()) => 1e-8,
+    DiagonalEK1(order=3, smooth=true) => 1e-7,
+    DiagonalEK1(order=3, smooth=true, diffusionmodel=FixedDiffusion()) => 1e-7,
+    DiagonalEK1(order=3, smooth=true, diffusionmodel=DynamicDiffusion()) => 1e-7,
+    DiagonalEK1(order=3, smooth=true, initialization=ClassicSolverInit()) => 1e-7,
     # Priors
     EK0(prior=IOUP(3, -1), smooth=true) => 2e-9,
     EK1(prior=IOUP(3, -1), smooth=true, diffusionmodel=FixedDiffusion()) => 1e-9,
@@ -53,12 +61,17 @@ ADAPTIVE_ALGS = (
     EK0(order=3, initialization=ClassicSolverInit()) => 5e-5,
     EK0(order=3, initialization=SimpleInit()) => 1e-4,
     EK0(order=3, diffusionmodel=DynamicMVDiffusion(), initialization=ClassicSolverInit()) => 4e-5,
+    EK0(order=3, diffusionmodel=FixedMVDiffusion()) => 1e-4,
     EK1(order=2) => 2e-5,
     EK1(order=3) => 1e-5,
     EK1(order=5) => 1e-6,
     EK1(order=8) => 5e-6,
     EK1(order=3, initialization=ClassicSolverInit()) => 1e-5,
     EK1(order=3, initialization=SimpleInit()) => 1e-4,
+    DiagonalEK1(order=3) => 1e-4,
+    DiagonalEK1(order=3, diffusionmodel=FixedDiffusion()) => 1e-4,
+    DiagonalEK1(order=3, diffusionmodel=DynamicDiffusion()) => 1e-4,
+    DiagonalEK1(order=3, initialization=ClassicSolverInit()) => 1e-4,
     # Priors
     EK0(prior=IOUP(3, -1), smooth=true) => 1e-5,
     EK1(prior=IOUP(3, -1), smooth=true, diffusionmodel=FixedDiffusion()) => 1e-5,
diff --git a/test/data_likelihoods.jl b/test/data_likelihoods.jl
index 26a6b2596..9fe370616 100644
--- a/test/data_likelihoods.jl
+++ b/test/data_likelihoods.jl
@@ -33,25 +33,29 @@ kwargs = (
 )
 @testset "Compare data likelihoods" begin
     @testset "$alg" for alg in (
+        # EK0
+        EK0(),
+        EK0(diffusionmodel=FixedDiffusion()),
+        EK0(diffusionmodel=FixedMVDiffusion(rand(2), false)),
+        EK0(diffusionmodel=DynamicMVDiffusion()),
+        EK0(prior=IOUP(3, -1)),
+        EK0(prior=Matern(3, 1.5)),
+        # EK1
         EK1(),
         EK1(diffusionmodel=FixedDiffusion()),
         EK1(diffusionmodel=FixedMVDiffusion(rand(2), false)),
         EK1(prior=IOUP(3, -1)),
         EK1(prior=Matern(3, 1.5)),
         EK1(prior=IOUP(3, update_rate_parameter=true)),
+        # DiagonalEK1
+        DiagonalEK1(),
+        DiagonalEK1(diffusionmodel=FixedDiffusion()),
+        DiagonalEK1(diffusionmodel=FixedMVDiffusion(rand(2), false)),
     )
         compare_data_likelihoods(alg; kwargs...)
     end
 end
 
-@testset "EK0 is not (yet) supported" begin
-    for ll in (PNDE.dalton_data_loglik, PNDE.filtering_data_loglik)
-        @test_broken ll(prob, EK0(smooth=false); kwargs...)
-    end
-    @test_broken PNDE.fenrir_data_loglik(
-        prob, EK0(smooth=true); kwargs...)
-end
-
 @testset "Partial observations" begin
     H = [1 0;]
     data_part = (t=times, u=[H * d for d in obss])
@@ -63,6 +67,14 @@ end
         adaptive=false, dt=DT,
         dense=false,
     )
+    @test_broken compare_data_likelihoods(
+        DiagonalEK1();
+        observation_matrix=H,
+        observation_noise_cov=σ^2,
+        data=data_part,
+        adaptive=false, dt=DT,
+        dense=false,
+    )
 end
 
 @testset "Observation noise types: $(typeof(Σ))" for Σ in (
@@ -70,15 +82,27 @@ end
     σ^2 * I,
     σ^2 * I(2),
     σ^2 * Eye(2),
+    Diagonal([σ^2 0; 0 2σ^2]),
     [σ^2 0; 0 2σ^2],
     (A = randn(2, 2); A'A),
     (PSDMatrix(randn(2, 2))),
 )
-    compare_data_likelihoods(
-        EK1();
-        observation_noise_cov=Σ,
-        data=data,
-        adaptive=false, dt=DT,
-        dense=false,
-    )
+    @testset "$alg" for alg in (EK0(), DiagonalEK1(), EK1())
+        if alg isa EK0 && !(
+            Σ isa Number || Σ isa UniformScaling ||
+            Σ isa Diagonal{<:Number,<:FillArrays.Fill}
+        )
+            continue
+        end
+        if alg isa DiagonalEK1 && !(Σ isa Number || Σ isa UniformScaling || Σ isa Diagonal)
+            continue
+        end
+        compare_data_likelihoods(
+            alg;
+            observation_noise_cov=Σ,
+            data=data,
+            adaptive=false, dt=DT,
+            dense=false,
+        )
+    end
 end
diff --git a/test/errors_thrown.jl b/test/errors_thrown.jl
index 88c323259..562f2074f 100644
--- a/test/errors_thrown.jl
+++ b/test/errors_thrown.jl
@@ -31,3 +31,30 @@ end
     prior = IOUP(num_derivatives=1, rate_parameter=3, update_rate_parameter=true)
     @test_throws ArgumentError solve(prob, EK0(; prior))
 end
+
+@testset "Invalid prior" begin
+    prob = prob_ode_lotkavolterra
+    @test_throws DimensionMismatch solve(prob, EK0(prior=IWP(dim=3, num_derivatives=2)))
+    prior = IOUP(num_derivatives=1, rate_parameter=3, update_rate_parameter=true)
+    @test_throws ArgumentError solve(prob, EK0(; prior))
+end
+
+@testset "Invalid solver configurations" begin
+    prob = prob_ode_lotkavolterra
+
+    # Global calibration + observation noise doesn't work
+    @test_throws ArgumentError solve(
+        prob, EK0(pn_observation_noise=1, diffusionmodel=FixedDiffusion()))
+    @test_throws ArgumentError solve(
+        prob, EK0(pn_observation_noise=1, diffusionmodel=FixedMVDiffusion()))
+
+    # EK1 + Multivariate diffusion doesn't work:
+    @test_throws ArgumentError solve(
+        prob, EK1(diffusionmodel=FixedMVDiffusion()))
+    @test_throws ArgumentError solve(
+        prob, EK1(diffusionmodel=DynamicMVDiffusion()))
+
+    # Multivariate diffusion with non-diagonal diffusion model
+    @test_throws ArgumentError solve(
+        prob, EK0(diffusionmodel=FixedMVDiffusion(initial_diffusion=rand(2, 2))))
+end
diff --git a/test/mass_matrix.jl b/test/mass_matrix.jl
index b24ebe170..76786da85 100644
--- a/test/mass_matrix.jl
+++ b/test/mass_matrix.jl
@@ -34,11 +34,22 @@ using Test
             adaptive=false,
             dt=1e-2,
         )
+        diagonalek1() = solve(
+            prob,
+            DiagonalEK1(smooth=false),
+            save_everystep=false,
+            dense=false,
+            adaptive=false,
+            dt=1e-2,
+        )
         s1 = ek1()
         s0 = ek0()
+        s1diag = diagonalek1()
 
         ref = solve(prob, RadauIIA5(), abstol=1e-9, reltol=1e-6)
         @test s0.u[end] ≈ ref.u[end] rtol = 1e-7
+        @test s1.u[end] ≈ ref.u[end] rtol = 1e-7
+        @test s1diag.u[end] ≈ ref.u[end] rtol = 1e-7
 
         @test s1.pu.Σ[1] isa PSDMatrix{<:Number,<:Matrix}
         @test s0.pu.Σ[1] isa PSDMatrix{<:Number,<:ProbNumDiffEq.IsometricKroneckerProduct}
@@ -63,6 +74,7 @@ end
         0 1 0
         0 0 0
     ]
+    M = Diagonal([1, 1, 0])
     f = ODEFunction(rober, mass_matrix=M)
     prob = ODEProblem(f, [1.0, 0.0, 0.0], (0.0, 1e-2), (0.04, 3e7, 1e4))
 
@@ -78,4 +90,9 @@ end
 
     sol = solve(prob, EK1(order=3, initialization=SimpleInit()))
     @test sol.u[end] ≈ ref.u[end] rtol = 1e-8
+
+    sol = solve(prob, DiagonalEK1(order=3))
+    @test sol.u[end] ≈ ref.u[end] rtol = 1e-8
+
+    @test_throws ArgumentError solve(prob, EK0())
 end
diff --git a/test/observation_noise.jl b/test/observation_noise.jl
new file mode 100644
index 000000000..977fc8ac1
--- /dev/null
+++ b/test/observation_noise.jl
@@ -0,0 +1,28 @@
+using Test
+using ProbNumDiffEq
+using LinearAlgebra, FillArrays
+import ODEProblemLibrary: prob_ode_lotkavolterra
+
+prob = prob_ode_lotkavolterra
+d = length(prob.u0)
+@testset "typeof(R)=$(typeof(R))" for (i, R) in enumerate((
+    0.1,
+    0.1I,
+    0.1Eye(d),
+    0.1I(d),
+    Diagonal([0.1, 0.2]),
+    [0.1 0.01; 0.01 0.1],
+    PSDMatrix(0.1 * rand(d, d)),
+))
+    if i <= 3
+        @test_nowarn solve(prob, EK0(pn_observation_noise=R))
+    else
+        @test_broken solve(prob, EK0(pn_observation_noise=R))
+    end
+    if i <= 5
+        @test_nowarn solve(prob, DiagonalEK1(pn_observation_noise=R))
+    else
+        @test_broken solve(prob, DiagonalEK1(pn_observation_noise=R))
+    end
+    @test_nowarn solve(prob, EK1(pn_observation_noise=R))
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 966119b38..b39c91e9a 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -19,12 +19,21 @@ const GROUP = get(ENV, "GROUP", "All")
 @testset "ProbNumDiffEq" begin
     if GROUP == "All" || GROUP == "Core"
         @timedtestset "Core" begin
+            @timedsafetestset "BlockDiagonals" begin
+                include("core/blockdiagonals.jl")
+            end
+            @timedsafetestset "FastLinalg (`_matmul!`)" begin
+                include("core/fast_linalg.jl")
+            end
             @timedsafetestset "Filtering" begin
                 include("core/filtering.jl")
             end
             @timedsafetestset "Priors" begin
                 include("core/priors.jl")
             end
+            @timedsafetestset "Diffusions" begin
+                include("core/diffusions.jl")
+            end
             @timedsafetestset "Preconditioning" begin
                 include("core/preconditioning.jl")
             end
@@ -120,6 +129,9 @@ const GROUP = get(ENV, "GROUP", "All")
             @timedsafetestset "Data Likelihoods" begin
                 include("data_likelihoods.jl")
             end
+            @timedsafetestset "Observation noise" begin
+                include("observation_noise.jl")
+            end
         end
     end
 
diff --git a/test/secondorderode.jl b/test/secondorderode.jl
index b849af055..468a174b5 100644
--- a/test/secondorderode.jl
+++ b/test/secondorderode.jl
@@ -4,13 +4,13 @@ using LinearAlgebra
 using Test
 
 function twobody(du, u, p, t)
-    R3 = norm(u[1:2])^3
-    @. du[3:4] = -u[1:2] / R3
-    @. du[1:2] = u[3:4]
+    R3 = norm(u[3:4])^3
+    @. du[3:4] = u[1:2]
+    @. du[1:2] = -u[3:4] / R3
 end
 u0, du0 = [0.4, 0.0], [0.0, 2.0]
 tspan = (0, 0.1)
-prob_base = ODEProblem(twobody, [u0...; du0...], tspan)
+prob_base = ODEProblem(twobody, [du0...; u0...], tspan)
 
 function twobody2_iip(ddu, du, u, p, t)
     R3 = norm(u)^3
@@ -24,7 +24,7 @@ function twobody2_oop(du, u, p, t)
 end
 prob_oop = SecondOrderODEProblem(twobody2_oop, du0, u0, tspan)
 
-appxsol = solve(prob_iip, Vern9(), abstol=1e-10, reltol=1e-10)
+appxsol = solve(prob_base, Vern9(), abstol=1e-9, reltol=1e-6)
 
 @testset "$S" for (S, _prob) in (("IIP", prob_iip), ("OOP", prob_oop))
     @testset "$alg" for alg in (
@@ -34,8 +34,8 @@ appxsol = solve(prob_iip, Vern9(), abstol=1e-10, reltol=1e-10)
         EK1(initialization=ForwardDiffInit(2)),
         # EK1(initialization=ClassicSolverInit()), # unstable for this problem
         EK1(diffusionmodel=FixedDiffusion()),
-        EK0(diffusionmodel=FixedMVDiffusion()),
-        EK0(diffusionmodel=DynamicMVDiffusion()),
+        # EK0(diffusionmodel=FixedMVDiffusion()),
+        # EK0(diffusionmodel=DynamicMVDiffusion()),
     )
         sol = solve(_prob, alg)
 
diff --git a/test/state_init.jl b/test/state_init.jl
index 4330e5804..bf30c0082 100644
--- a/test/state_init.jl
+++ b/test/state_init.jl
@@ -64,7 +64,7 @@ import ODEProblemLibrary: prob_ode_fitzhughnagumo, prob_ode_pleiades
             prob,
             EK0(order=2, initialization=ClassicSolverInit(init_on_ddu=false)),
         )
-        @test_broken init(
+        @test_nowarn init(
             prob,
             EK0(order=2, initialization=ClassicSolverInit(init_on_ddu=true)),
         )
diff --git a/test/stiff_problem.jl b/test/stiff_problem.jl
index de8a6009f..e94a3ad54 100644
--- a/test/stiff_problem.jl
+++ b/test/stiff_problem.jl
@@ -8,3 +8,7 @@ appxsol = solve(prob, RadauIIA5())
 sol = solve(prob, EK1(order=3))
 @test appxsol.u[end] ≈ sol.u[end] rtol = 5e-3
 @test appxsol(0.5) ≈ sol(0.5).μ rtol = 5e-3
+
+sol = solve(prob, DiagonalEK1(order=3))
+@test appxsol.u[end] ≈ sol.u[end] rtol = 5e-3
+@test appxsol(0.5) ≈ sol(0.5).μ rtol = 5e-3