diff --git a/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl b/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl
new file mode 100644
index 00000000000..86b5ae64348
--- /dev/null
+++ b/examples/dgmulti_2d/elixir_navierstokes_convergence_curved.jl
@@ -0,0 +1,214 @@
+using OrdinaryDiffEq
+using Trixi
+
+###############################################################################
+# semidiscretization of the ideal compressible Navier-Stokes equations
+
+prandtl_number() = 0.72
+mu() = 0.01
+
+equations = CompressibleEulerEquations2D(1.4)
+# Note: If you change the Navier-Stokes parameters here, also change them in the initial condition
+# I really do not like this structure but it should work for now
+equations_parabolic = CompressibleNavierStokesDiffusion2D(equations, mu=mu(), Prandtl=prandtl_number(),
+                                                          gradient_variables=GradientVariablesPrimitive())
+
+# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
+dg = DGMulti(polydeg = 3, element_type = Tri(), approximation_type = Polynomial(),
+             surface_integral = SurfaceIntegralWeakForm(flux_lax_friedrichs),
+             volume_integral = VolumeIntegralWeakForm())
+
+top_bottom(x, tol=50*eps()) = abs(abs(x[2]) - 1) < tol
+is_on_boundary = Dict(:top_bottom => top_bottom)
+
+function mapping(xi, eta)
+  x = xi  + 0.1 * sin(pi * xi) * sin(pi * eta)
+  y = eta + 0.1 * sin(pi * xi) * sin(pi * eta)
+  return SVector(x, y)
+end
+cells_per_dimension = (16, 16)
+mesh = DGMultiMesh(dg, cells_per_dimension, mapping; periodicity=(true, false), is_on_boundary)
+
+# This initial condition is taken from `examples/dgmulti_2d/elixir_navierstokes_convergence.jl`
+
+# Note: the initial condition cannot be specialized to `CompressibleNavierStokesDiffusion2D`
+#       since it is called by both the parabolic solver (which passes in `CompressibleNavierStokesDiffusion2D`)
+#       and by the initial condition (which passes in `CompressibleEulerEquations2D`).
+# This convergence test setup was originally derived by Andrew Winters (@andrewwinters5000)
+function initial_condition_navier_stokes_convergence_test(x, t, equations)
+  # Amplitude and shift
+  A = 0.5
+  c = 2.0
+
+  # convenience values for trig. functions
+  pi_x = pi * x[1]
+  pi_y = pi * x[2]
+  pi_t = pi * t
+
+  rho = c + A * sin(pi_x) * cos(pi_y) * cos(pi_t)
+  v1  = sin(pi_x) * log(x[2] + 2.0) * (1.0 - exp(-A * (x[2] - 1.0)) ) * cos(pi_t)
+  v2  = v1
+  p   = rho^2
+
+  return prim2cons(SVector(rho, v1, v2, p), equations)
+end
+
+@inline function source_terms_navier_stokes_convergence_test(u, x, t, equations)
+  y = x[2]
+
+  # TODO: parabolic
+  # we currently need to hardcode these parameters until we fix the "combined equation" issue
+  # see also https://github.com/trixi-framework/Trixi.jl/pull/1160
+  inv_gamma_minus_one = inv(equations.gamma - 1)
+  Pr = prandtl_number()
+  mu_ = mu()
+
+  # Same settings as in `initial_condition`
+  # Amplitude and shift
+  A = 0.5
+  c = 2.0
+
+  # convenience values for trig. functions
+  pi_x = pi * x[1]
+  pi_y = pi * x[2]
+  pi_t = pi * t
+
+  # compute the manufactured solution and all necessary derivatives
+  rho    =  c  + A * sin(pi_x) * cos(pi_y) * cos(pi_t)
+  rho_t  = -pi * A * sin(pi_x) * cos(pi_y) * sin(pi_t)
+  rho_x  =  pi * A * cos(pi_x) * cos(pi_y) * cos(pi_t)
+  rho_y  = -pi * A * sin(pi_x) * sin(pi_y) * cos(pi_t)
+  rho_xx = -pi * pi * A * sin(pi_x) * cos(pi_y) * cos(pi_t)
+  rho_yy = -pi * pi * A * sin(pi_x) * cos(pi_y) * cos(pi_t)
+
+  v1    =       sin(pi_x) * log(y + 2.0) * (1.0 - exp(-A * (y - 1.0))) * cos(pi_t)
+  v1_t  = -pi * sin(pi_x) * log(y + 2.0) * (1.0 - exp(-A * (y - 1.0))) * sin(pi_t)
+  v1_x  =  pi * cos(pi_x) * log(y + 2.0) * (1.0 - exp(-A * (y - 1.0))) * cos(pi_t)
+  v1_y  =       sin(pi_x) * (A * log(y + 2.0) * exp(-A * (y - 1.0)) + (1.0 - exp(-A * (y - 1.0))) / (y + 2.0)) * cos(pi_t)
+  v1_xx = -pi * pi * sin(pi_x) * log(y + 2.0) * (1.0 - exp(-A * (y - 1.0))) * cos(pi_t)
+  v1_xy =  pi * cos(pi_x) * (A * log(y + 2.0) * exp(-A * (y - 1.0)) + (1.0 - exp(-A * (y - 1.0))) / (y + 2.0)) * cos(pi_t)
+  v1_yy = (sin(pi_x) * ( 2.0 * A * exp(-A * (y - 1.0)) / (y + 2.0)
+                         - A * A * log(y + 2.0) * exp(-A * (y - 1.0))
+                         - (1.0 - exp(-A * (y - 1.0))) / ((y + 2.0) * (y + 2.0))) * cos(pi_t))
+  v2    = v1
+  v2_t  = v1_t
+  v2_x  = v1_x
+  v2_y  = v1_y
+  v2_xx = v1_xx
+  v2_xy = v1_xy
+  v2_yy = v1_yy
+
+  p    = rho * rho
+  p_t  = 2.0 * rho * rho_t
+  p_x  = 2.0 * rho * rho_x
+  p_y  = 2.0 * rho * rho_y
+  p_xx = 2.0 * rho * rho_xx + 2.0 * rho_x * rho_x
+  p_yy = 2.0 * rho * rho_yy + 2.0 * rho_y * rho_y
+
+  # Note this simplifies slightly because the ansatz assumes that v1 = v2
+  E   = p * inv_gamma_minus_one + 0.5 * rho * (v1^2 + v2^2)
+  E_t = p_t * inv_gamma_minus_one + rho_t * v1^2 + 2.0 * rho * v1 * v1_t
+  E_x = p_x * inv_gamma_minus_one + rho_x * v1^2 + 2.0 * rho * v1 * v1_x
+  E_y = p_y * inv_gamma_minus_one + rho_y * v1^2 + 2.0 * rho * v1 * v1_y
+
+  # Some convenience constants
+  T_const = equations.gamma * inv_gamma_minus_one / Pr
+  inv_rho_cubed = 1.0 / (rho^3)
+
+  # compute the source terms
+  # density equation
+  du1 = rho_t + rho_x * v1 + rho * v1_x + rho_y * v2 + rho * v2_y
+
+  # x-momentum equation
+  du2 = ( rho_t * v1 + rho * v1_t + p_x + rho_x * v1^2
+                                        + 2.0   * rho  * v1 * v1_x
+                                        + rho_y * v1   * v2
+                                        + rho   * v1_y * v2
+                                        + rho   * v1   * v2_y
+    # stress tensor from x-direction
+                      - 4.0 / 3.0 * v1_xx * mu_
+                      + 2.0 / 3.0 * v2_xy * mu_
+                      - v1_yy             * mu_
+                      - v2_xy             * mu_ )
+  # y-momentum equation
+  du3 = ( rho_t * v2 + rho * v2_t + p_y + rho_x * v1    * v2
+                                        + rho   * v1_x  * v2
+                                        + rho   * v1    * v2_x
+                                        +         rho_y * v2^2
+                                        + 2.0   * rho   * v2 * v2_y
+    # stress tensor from y-direction
+                      - v1_xy             * mu_
+                      - v2_xx             * mu_
+                      - 4.0 / 3.0 * v2_yy * mu_
+                      + 2.0 / 3.0 * v1_xy * mu_ )
+  # total energy equation
+  du4 = ( E_t + v1_x * (E + p) + v1 * (E_x + p_x)
+              + v2_y * (E + p) + v2 * (E_y + p_y)
+    # stress tensor and temperature gradient terms from x-direction
+                                - 4.0 / 3.0 * v1_xx * v1   * mu_
+                                + 2.0 / 3.0 * v2_xy * v1   * mu_
+                                - 4.0 / 3.0 * v1_x  * v1_x * mu_
+                                + 2.0 / 3.0 * v2_y  * v1_x * mu_
+                                - v1_xy     * v2           * mu_
+                                - v2_xx     * v2           * mu_
+                                - v1_y      * v2_x         * mu_
+                                - v2_x      * v2_x         * mu_
+         - T_const * inv_rho_cubed * (        p_xx * rho   * rho
+                                      - 2.0 * p_x  * rho   * rho_x
+                                      + 2.0 * p    * rho_x * rho_x
+                                      -       p    * rho   * rho_xx ) * mu_
+    # stress tensor and temperature gradient terms from y-direction
+                                - v1_yy     * v1           * mu_
+                                - v2_xy     * v1           * mu_
+                                - v1_y      * v1_y         * mu_
+                                - v2_x      * v1_y         * mu_
+                                - 4.0 / 3.0 * v2_yy * v2   * mu_
+                                + 2.0 / 3.0 * v1_xy * v2   * mu_
+                                - 4.0 / 3.0 * v2_y  * v2_y * mu_
+                                + 2.0 / 3.0 * v1_x  * v2_y * mu_
+         - T_const * inv_rho_cubed * (        p_yy * rho   * rho
+                                      - 2.0 * p_y  * rho   * rho_y
+                                      + 2.0 * p    * rho_y * rho_y
+                                      -       p    * rho   * rho_yy ) * mu_ )
+
+  return SVector(du1, du2, du3, du4)
+end
+
+initial_condition = initial_condition_navier_stokes_convergence_test
+
+# BC types
+velocity_bc_top_bottom = NoSlip((x, t, equations) -> initial_condition_navier_stokes_convergence_test(x, t, equations)[2:3])
+heat_bc_top_bottom = Adiabatic((x, t, equations) -> 0.0)
+boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_top_bottom, heat_bc_top_bottom)
+
+# define inviscid boundary conditions
+boundary_conditions = (; :top_bottom => boundary_condition_slip_wall)
+
+# define viscous boundary conditions
+boundary_conditions_parabolic = (; :top_bottom => boundary_condition_top_bottom)
+
+semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, dg;
+                                             boundary_conditions=(boundary_conditions, boundary_conditions_parabolic),
+                                             source_terms=source_terms_navier_stokes_convergence_test)
+
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+# Create ODE problem with time span `tspan`
+tspan = (0.0, 0.5)
+ode = semidiscretize(semi, tspan)
+
+summary_callback = SummaryCallback()
+alive_callback = AliveCallback(alive_interval=10)
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval=analysis_interval, uEltype=real(dg))
+callbacks = CallbackSet(summary_callback, alive_callback)
+
+###############################################################################
+# run the simulation
+
+time_int_tol = 1e-8
+sol = solve(ode, RDPK3SpFSAL49(); abstol=time_int_tol, reltol=time_int_tol,
+            ode_default_options()..., callback=callbacks)
+summary_callback() # print the timer summary
diff --git a/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl b/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl
new file mode 100644
index 00000000000..c14d6620803
--- /dev/null
+++ b/examples/dgmulti_3d/elixir_navierstokes_convergence_curved.jl
@@ -0,0 +1,263 @@
+using OrdinaryDiffEq
+using Trixi
+
+###############################################################################
+# semidiscretization of the ideal compressible Navier-Stokes equations
+
+prandtl_number() = 0.72
+mu() = 0.01
+
+equations = CompressibleEulerEquations3D(1.4)
+equations_parabolic = CompressibleNavierStokesDiffusion3D(equations, mu=mu(), Prandtl=prandtl_number(),
+                                                          gradient_variables=GradientVariablesPrimitive())
+
+# Create DG solver with polynomial degree = 3 and (local) Lax-Friedrichs/Rusanov flux as surface flux
+dg = DGMulti(polydeg = 3, element_type = Hex(), approximation_type = Polynomial(),
+             surface_integral = SurfaceIntegralWeakForm(flux_lax_friedrichs),
+             volume_integral = VolumeIntegralWeakForm())
+
+top_bottom(x, tol=50*eps()) = abs(abs(x[2]) - 1) < tol
+is_on_boundary = Dict(:top_bottom => top_bottom)
+
+function mapping(xi, eta, zeta)
+  x = xi   + 0.1 * sin(pi * xi) * sin(pi * eta)
+  y = eta  + 0.1 * sin(pi * xi) * sin(pi * eta)
+  z = zeta + 0.1 * sin(pi * xi) * sin(pi * eta)
+  return SVector(x, y, z)
+end
+cells_per_dimension = (8, 8, 8)
+mesh = DGMultiMesh(dg, cells_per_dimension, mapping; periodicity=(true, false, true), is_on_boundary)
+
+# This initial condition is taken from `examples/dgmulti_3d/elixir_navierstokes_convergence.jl`
+
+# Note: the initial condition cannot be specialized to `CompressibleNavierStokesDiffusion3D`
+#       since it is called by both the parabolic solver (which passes in `CompressibleNavierStokesDiffusion3D`)
+#       and by the initial condition (which passes in `CompressibleEulerEquations3D`).
+# This convergence test setup was originally derived by Andrew Winters (@andrewwinters5000)
+function initial_condition_navier_stokes_convergence_test(x, t, equations)
+  # Constants. OBS! Must match those in `source_terms_navier_stokes_convergence_test`
+  c  = 2.0
+  A1 = 0.5
+  A2 = 1.0
+  A3 = 0.5
+
+  # Convenience values for trig. functions
+  pi_x = pi * x[1]
+  pi_y = pi * x[2]
+  pi_z = pi * x[3]
+  pi_t = pi * t
+
+  rho = c + A1 * sin(pi_x) * cos(pi_y) * sin(pi_z) * cos(pi_t)
+  v1  = A2 * sin(pi_x) * log(x[2] + 2.0) * (1.0 - exp(-A3 * (x[2] - 1.0))) * sin(pi_z) * cos(pi_t)
+  v2  = v1
+  v3  = v1
+  p   = rho^2
+
+  return prim2cons(SVector(rho, v1, v2, v3, p), equations)
+end
+
+@inline function source_terms_navier_stokes_convergence_test(u, x, t, equations)
+  # TODO: parabolic
+  # we currently need to hardcode these parameters until we fix the "combined equation" issue
+  # see also https://github.com/trixi-framework/Trixi.jl/pull/1160
+  inv_gamma_minus_one = inv(equations.gamma - 1)
+  Pr = prandtl_number()
+  mu_ = mu()
+
+  # Constants. OBS! Must match those in `initial_condition_navier_stokes_convergence_test`
+  c  = 2.0
+  A1 = 0.5
+  A2 = 1.0
+  A3 = 0.5
+
+  # Convenience values for trig. functions
+  pi_x = pi * x[1]
+  pi_y = pi * x[2]
+  pi_z = pi * x[3]
+  pi_t = pi * t
+
+  # Define auxiliary functions for the strange function of the y variable
+  # to make expressions easier to read
+  g    = log(x[2] + 2.0) * (1.0 - exp(-A3 * (x[2] - 1.0)))
+  g_y  = ( A3 * log(x[2] + 2.0) * exp(-A3 * (x[2] - 1.0))
+           + (1.0 - exp(-A3 * (x[2] - 1.0))) / (x[2] + 2.0) )
+  g_yy = ( 2.0 * A3 * exp(-A3 * (x[2] - 1.0)) / (x[2] + 2.0)
+           - (1.0 - exp(-A3 * (x[2] - 1.0))) / ((x[2] + 2.0)^2)
+           - A3^2 * log(x[2] + 2.0) * exp(-A3 * (x[2] - 1.0)) )
+
+  # Density and its derivatives
+  rho    =   c + A1 * sin(pi_x) * cos(pi_y) * sin(pi_z) * cos(pi_t)
+  rho_t  = -pi * A1 * sin(pi_x) * cos(pi_y) * sin(pi_z) * sin(pi_t)
+  rho_x  =  pi * A1 * cos(pi_x) * cos(pi_y) * sin(pi_z) * cos(pi_t)
+  rho_y  = -pi * A1 * sin(pi_x) * sin(pi_y) * sin(pi_z) * cos(pi_t)
+  rho_z  =  pi * A1 * sin(pi_x) * cos(pi_y) * cos(pi_z) * cos(pi_t)
+  rho_xx = -pi^2 * (rho - c)
+  rho_yy = -pi^2 * (rho - c)
+  rho_zz = -pi^2 * (rho - c)
+
+  # Velocities and their derivatives
+  # v1 terms
+  v1    =       A2 * sin(pi_x) * g * sin(pi_z) * cos(pi_t)
+  v1_t  = -pi * A2 * sin(pi_x) * g * sin(pi_z) * sin(pi_t)
+  v1_x  =  pi * A2 * cos(pi_x) * g * sin(pi_z) * cos(pi_t)
+  v1_y  =       A2 * sin(pi_x) * g_y * sin(pi_z) * cos(pi_t)
+  v1_z  =  pi * A2 * sin(pi_x) * g * cos(pi_z) * cos(pi_t)
+  v1_xx = -pi^2 * v1
+  v1_yy =       A2 * sin(pi_x) * g_yy * sin(pi_z) * cos(pi_t)
+  v1_zz = -pi^2 * v1
+  v1_xy =  pi * A2 * cos(pi_x) * g_y * sin(pi_z) * cos(pi_t)
+  v1_xz =  pi^2 * A2 * cos(pi_x) * g * cos(pi_z) * cos(pi_t)
+  v1_yz =  pi * A2 * sin(pi_x) * g_y * cos(pi_z) * cos(pi_t)
+  # v2 terms (simplifies from ansatz)
+  v2    = v1
+  v2_t  = v1_t
+  v2_x  = v1_x
+  v2_y  = v1_y
+  v2_z  = v1_z
+  v2_xx = v1_xx
+  v2_yy = v1_yy
+  v2_zz = v1_zz
+  v2_xy = v1_xy
+  v2_yz = v1_yz
+  # v3 terms (simplifies from ansatz)
+  v3    = v1
+  v3_t  = v1_t
+  v3_x  = v1_x
+  v3_y  = v1_y
+  v3_z  = v1_z
+  v3_xx = v1_xx
+  v3_yy = v1_yy
+  v3_zz = v1_zz
+  v3_xz = v1_xz
+  v3_yz = v1_yz
+
+  # Pressure and its derivatives
+  p    = rho^2
+  p_t  = 2.0 * rho * rho_t
+  p_x  = 2.0 * rho * rho_x
+  p_y  = 2.0 * rho * rho_y
+  p_z  = 2.0 * rho * rho_z
+
+  # Total energy and its derivatives; simiplifies from ansatz that v2 = v1 and v3 = v1
+  E   = p   * inv_gamma_minus_one + 1.5 * rho * v1^2
+  E_t = p_t * inv_gamma_minus_one + 1.5 * rho_t * v1^2 + 3.0 * rho * v1 * v1_t
+  E_x = p_x * inv_gamma_minus_one + 1.5 * rho_x * v1^2 + 3.0 * rho * v1 * v1_x
+  E_y = p_y * inv_gamma_minus_one + 1.5 * rho_y * v1^2 + 3.0 * rho * v1 * v1_y
+  E_z = p_z * inv_gamma_minus_one + 1.5 * rho_z * v1^2 + 3.0 * rho * v1 * v1_z
+
+  # Divergence of Fick's law ∇⋅∇q = kappa ∇⋅∇T; simplifies because p = rho², so T = p/rho = rho
+  kappa = equations.gamma * inv_gamma_minus_one / Pr
+  q_xx = kappa * rho_xx # kappa T_xx
+  q_yy = kappa * rho_yy # kappa T_yy
+  q_zz = kappa * rho_zz # kappa T_zz
+
+  # Stress tensor and its derivatives (exploit symmetry)
+  tau11 = 4.0 / 3.0 * v1_x - 2.0 / 3.0 * (v2_y + v3_z)
+  tau12 = v1_y + v2_x
+  tau13 = v1_z + v3_x
+  tau22 = 4.0 / 3.0 * v2_y - 2.0 / 3.0 * (v1_x + v3_z)
+  tau23 = v2_z + v3_y
+  tau33 = 4.0 / 3.0 * v3_z - 2.0 / 3.0 * (v1_x + v2_y)
+
+  tau11_x = 4.0 / 3.0 * v1_xx - 2.0 / 3.0 * (v2_xy + v3_xz)
+  tau12_x = v1_xy + v2_xx
+  tau13_x = v1_xz + v3_xx
+
+  tau12_y = v1_yy + v2_xy
+  tau22_y = 4.0 / 3.0 * v2_yy - 2.0 / 3.0 * (v1_xy + v3_yz)
+  tau23_y = v2_yz + v3_yy
+
+  tau13_z = v1_zz + v3_xz
+  tau23_z = v2_zz + v3_yz
+  tau33_z = 4.0 / 3.0 * v3_zz - 2.0 / 3.0 * (v1_xz + v2_yz)
+
+  # Compute the source terms
+  # Density equation
+  du1 = ( rho_t + rho_x * v1 + rho * v1_x
+                + rho_y * v2 + rho * v2_y
+                + rho_z * v3 + rho * v3_z )
+  # x-momentum equation
+  du2 = ( rho_t * v1 + rho * v1_t + p_x +         rho_x * v1^2
+                                        + 2.0   * rho   * v1 * v1_x
+                                        + rho_y * v1    * v2
+                                        + rho   * v1_y  * v2
+                                        + rho   * v1    * v2_y
+                                        + rho_z * v1    * v3
+                                        + rho   * v1_z  * v3
+                                        + rho   * v1    * v3_z
+                              - mu_ * (tau11_x + tau12_y + tau13_z) )
+  # y-momentum equation
+  du3 = ( rho_t * v2 + rho * v2_t + p_y + rho_x * v1    * v2
+                                        + rho   * v1_x  * v2
+                                        + rho   * v1    * v2_x
+                                        +         rho_y * v2^2
+                                        + 2.0   * rho   * v2 * v2_y
+                                        + rho_z * v2    * v3
+                                        + rho   * v2_z  * v3
+                                        + rho   * v2    * v3_z
+                              - mu_ * (tau12_x + tau22_y + tau23_z) )
+  # z-momentum equation
+  du4 = ( rho_t * v3 + rho * v3_t + p_z + rho_x * v1    * v3
+                                        + rho   * v1_x  * v3
+                                        + rho   * v1    * v3_x
+                                        + rho_y * v2    * v3
+                                        + rho   * v2_y  * v3
+                                        + rho   * v2    * v3_y
+                                        +         rho_z * v3^2
+                                        + 2.0   * rho   * v3 * v3_z
+                              - mu_ * (tau13_x + tau23_y + tau33_z) )
+  # Total energy equation
+  du5 = ( E_t + v1_x * (E + p) + v1 * (E_x + p_x)
+              + v2_y * (E + p) + v2 * (E_y + p_y)
+              + v3_z * (E + p) + v3 * (E_z + p_z)
+    # stress tensor and temperature gradient from x-direction
+      - mu_ * ( q_xx + v1_x * tau11   + v2_x * tau12   + v3_x * tau13
+                     + v1   * tau11_x + v2   * tau12_x + v3   * tau13_x)
+    # stress tensor and temperature gradient terms from y-direction
+      - mu_ * ( q_yy + v1_y * tau12   + v2_y * tau22   + v3_y * tau23
+                     + v1   * tau12_y + v2   * tau22_y + v3   * tau23_y)
+    # stress tensor and temperature gradient terms from z-direction
+      - mu_ * ( q_zz + v1_z * tau13   + v2_z * tau23   + v3_z * tau33
+                     + v1   * tau13_z + v2   * tau23_z + v3   * tau33_z) )
+
+  return SVector(du1, du2, du3, du4, du5)
+end
+
+initial_condition = initial_condition_navier_stokes_convergence_test
+
+# BC types
+velocity_bc_top_bottom = NoSlip((x, t, equations) -> initial_condition_navier_stokes_convergence_test(x, t, equations)[2:4])
+heat_bc_top_bottom = Adiabatic((x, t, equations) -> 0.0)
+boundary_condition_top_bottom = BoundaryConditionNavierStokesWall(velocity_bc_top_bottom, heat_bc_top_bottom)
+
+# define inviscid boundary conditions
+boundary_conditions = (; :top_bottom => boundary_condition_slip_wall)
+
+# define viscous boundary conditions
+boundary_conditions_parabolic = (; :top_bottom => boundary_condition_top_bottom)
+
+semi = SemidiscretizationHyperbolicParabolic(mesh, (equations, equations_parabolic), initial_condition, dg;
+                                             boundary_conditions=(boundary_conditions, boundary_conditions_parabolic),
+                                             source_terms=source_terms_navier_stokes_convergence_test)
+
+
+###############################################################################
+# ODE solvers, callbacks etc.
+
+# Create ODE problem with time span `tspan`
+tspan = (0.0, 1.0)
+ode = semidiscretize(semi, tspan)
+
+summary_callback = SummaryCallback()
+alive_callback = AliveCallback(alive_interval=10)
+analysis_interval = 100
+analysis_callback = AnalysisCallback(semi, interval=analysis_interval, uEltype=real(dg))
+callbacks = CallbackSet(summary_callback, alive_callback, analysis_callback)
+
+###############################################################################
+# run the simulation
+
+time_int_tol = 1e-8
+sol = solve(ode, RDPK3SpFSAL49(); abstol=time_int_tol, reltol=time_int_tol,
+            ode_default_options()..., callback=callbacks)
+summary_callback() # print the timer summary
diff --git a/src/equations/laplace_diffusion_2d.jl b/src/equations/laplace_diffusion_2d.jl
index 2f1afe25a6d..3963c616af2 100644
--- a/src/equations/laplace_diffusion_2d.jl
+++ b/src/equations/laplace_diffusion_2d.jl
@@ -29,7 +29,7 @@ end
 # The penalization depends on the solver, but also depends explicitly on physical parameters,
 # and would probably need to be specialized for every different equation.
 function penalty(u_outer, u_inner, inv_h, equations_parabolic::LaplaceDiffusion2D, dg::ViscousFormulationLocalDG)
-  return dg.penalty_parameter * (u_outer - u_inner) * equations_parabolic.diffusivity * inv_h
+  return dg.penalty_parameter * (u_outer - u_inner) * equations_parabolic.diffusivity
 end
 
 # Dirichlet-type boundary condition for use with a parabolic solver in weak form
diff --git a/src/solvers/dgmulti/dg.jl b/src/solvers/dgmulti/dg.jl
index dd6d9f43363..f9e30f8f871 100644
--- a/src/solvers/dgmulti/dg.jl
+++ b/src/solvers/dgmulti/dg.jl
@@ -489,8 +489,9 @@ end
 # inverts Jacobian and scales by -1.0
 function invert_jacobian!(du, mesh::DGMultiMesh, equations, dg::DGMulti, cache; scaling=-1)
   @threaded for e in eachelement(mesh, dg, cache)
+    invJ = cache.invJ[1, e]
     for i in axes(du, 1)
-      du[i, e] *= scaling * cache.invJ[i, e]
+      du[i, e] *= scaling * invJ
     end
   end
 end
diff --git a/src/solvers/dgmulti/dg_parabolic.jl b/src/solvers/dgmulti/dg_parabolic.jl
index 50cfd8ab17d..ef5153f421f 100644
--- a/src/solvers/dgmulti/dg_parabolic.jl
+++ b/src/solvers/dgmulti/dg_parabolic.jl
@@ -1,18 +1,33 @@
+# version for standard (e.g., non-entropy stable or flux differencing) schemes
 function create_cache_parabolic(mesh::DGMultiMesh,
                                 equations_hyperbolic::AbstractEquations,
                                 equations_parabolic::AbstractEquationsParabolic,
                                 dg::DGMulti, parabolic_scheme, RealT, uEltype)
-  # default to taking derivatives of all hyperbolic terms
+
+  # default to taking derivatives of all hyperbolic variables
   # TODO: parabolic; utilize the parabolic variables in `equations_parabolic` to reduce memory usage in the parabolic cache
   nvars = nvariables(equations_hyperbolic)
 
-  @unpack M, Drst = dg.basis
-  weak_differentiation_matrices = map(A -> -M \ (A' * M), Drst)
+  (; M, Vq, Pq, Drst) = dg.basis
+
+  # gradient operators: map from nodes to quadrature
+  strong_differentiation_matrices = map(A -> Vq * A, Drst)
+  gradient_lift_matrix = Vq * dg.basis.LIFT
+
+  # divergence operators: map from quadrature to nodes
+  weak_differentiation_matrices = map(A -> (M \ (-A' * M * Pq)), Drst)
+  divergence_lift_matrix = dg.basis.LIFT
+  projection_face_interpolation_matrix = dg.basis.Vf * dg.basis.Pq
+
+  # evaluate geometric terms at quadrature points in case the mesh is curved
+  (; md) = mesh
+  J = dg.basis.Vq * md.J
+  invJ = inv.(J)
+  dxidxhatj = map(x -> dg.basis.Vq * x, md.rstxyzJ)
 
   # u_transformed stores "transformed" variables for computing the gradient
-  @unpack md = mesh
   u_transformed = allocate_nested_array(uEltype, nvars, size(md.x), dg)
-  gradients = ntuple(_ -> similar(u_transformed), ndims(mesh))
+  gradients = SVector{ndims(mesh)}(ntuple(_ -> similar(u_transformed, (dg.basis.Nq, mesh.md.num_elements)), ndims(mesh)))
   flux_viscous = similar.(gradients)
 
   u_face_values = allocate_nested_array(uEltype, nvars, size(md.xf), dg)
@@ -20,20 +35,13 @@ function create_cache_parabolic(mesh::DGMultiMesh,
   gradients_face_values = ntuple(_ -> similar(u_face_values), ndims(mesh))
 
   local_u_values_threaded = [similar(u_transformed, dg.basis.Nq) for _ in 1:Threads.nthreads()]
-  local_flux_viscous_threaded = [ntuple(_ -> similar(u_transformed, dg.basis.Nq), ndims(mesh)) for _ in 1:Threads.nthreads()]
+  local_flux_viscous_threaded = [SVector{ndims(mesh)}(ntuple(_ -> similar(u_transformed, dg.basis.Nq), ndims(mesh))) for _ in 1:Threads.nthreads()]
   local_flux_face_values_threaded = [similar(scalar_flux_face_values[:, 1]) for _ in 1:Threads.nthreads()]
 
-  # precompute 1 / h for penalty terms
-  inv_h = similar(mesh.md.Jf)
-  J = dg.basis.Vf * mesh.md.J # interp to face nodes
-  for e in eachelement(mesh, dg)
-    for i in each_face_node(mesh, dg)
-      inv_h[i, e] = mesh.md.Jf[i, e] / J[i, e]
-    end
-  end
-
   return (; u_transformed, gradients, flux_viscous,
-            weak_differentiation_matrices, inv_h,
+            weak_differentiation_matrices, strong_differentiation_matrices,
+            gradient_lift_matrix, projection_face_interpolation_matrix, divergence_lift_matrix,
+            dxidxhatj, J, invJ, # geometric terms
             u_face_values, gradients_face_values, scalar_flux_face_values,
             local_u_values_threaded, local_flux_viscous_threaded, local_flux_face_values_threaded)
 end
@@ -48,17 +56,11 @@ function transform_variables!(u_transformed, u, mesh, equations_parabolic::Abstr
   end
 end
 
-# interpolates from solution coefficients to face quadrature points
-# We pass the `surface_integral` argument solely for dispatch
-function prolong2interfaces!(u_face_values, u, mesh::DGMultiMesh, equations::AbstractEquationsParabolic,
-                             surface_integral, dg::DGMulti, cache)
-  apply_to_each_field(mul_by!(dg.basis.Vf), u_face_values, u)
-end
-
-function calc_gradient_surface_integral(gradients, u, scalar_flux_face_values,
-                                        mesh, equations::AbstractEquationsParabolic,
-                                        dg::DGMulti, cache, cache_parabolic)
-  @unpack local_flux_face_values_threaded = cache_parabolic
+# TODO: reuse entropy projection computations for DGMultiFluxDiff{<:Polynomial} (including `GaussSBP` solvers)
+function calc_gradient_surface_integral!(gradients, u, scalar_flux_face_values,
+                                         mesh, equations::AbstractEquationsParabolic,
+                                         dg::DGMulti, cache, cache_parabolic)
+  (; gradient_lift_matrix, local_flux_face_values_threaded) = cache_parabolic
   @threaded for e in eachelement(mesh, dg)
     local_flux_values = local_flux_face_values_threaded[Threads.threadid()]
     for dim in eachdim(mesh)
@@ -66,52 +68,118 @@ function calc_gradient_surface_integral(gradients, u, scalar_flux_face_values,
         # compute flux * (nx, ny, nz)
         local_flux_values[i] = scalar_flux_face_values[i, e] * mesh.md.nxyzJ[dim][i, e]
       end
-      apply_to_each_field(mul_by_accum!(dg.basis.LIFT), view(gradients[dim], :, e), local_flux_values)
+      apply_to_each_field(mul_by_accum!(gradient_lift_matrix), view(gradients[dim], :, e), local_flux_values)
     end
   end
 end
 
-function calc_gradient!(gradients, u::StructArray, t, mesh::DGMultiMesh,
-                        equations::AbstractEquationsParabolic,
-                        boundary_conditions, dg::DGMulti, cache, cache_parabolic)
+function calc_gradient_volume_integral!(gradients, u, mesh::DGMultiMesh,
+                                        equations::AbstractEquationsParabolic,
+                                        dg::DGMulti, cache, cache_parabolic)
 
-  @unpack weak_differentiation_matrices = cache_parabolic
+  (; strong_differentiation_matrices) = cache_parabolic
 
-  for dim in eachindex(gradients)
-    reset_du!(gradients[dim], dg)
+  # compute volume contributions to gradients
+  @threaded for e in eachelement(mesh, dg)
+    for i in eachdim(mesh), j in eachdim(mesh)
+
+      # We assume each element is affine (e.g., constant geometric terms) here.
+      dxidxhatj = mesh.md.rstxyzJ[i, j][1, e]
+
+      apply_to_each_field(mul_by_accum!(strong_differentiation_matrices[j], dxidxhatj),
+                          view(gradients[i], :, e), view(u, :, e))
+    end
   end
+end
+
+function calc_gradient_volume_integral!(gradients, u, mesh::DGMultiMesh{NDIMS, <:NonAffine},
+                                        equations::AbstractEquationsParabolic,
+                                        dg::DGMulti, cache, cache_parabolic) where {NDIMS}
+
+  (; strong_differentiation_matrices, dxidxhatj, local_flux_viscous_threaded) = cache_parabolic
 
   # compute volume contributions to gradients
   @threaded for e in eachelement(mesh, dg)
+
+    # compute gradients with respect to reference coordinates
+    local_reference_gradients = local_flux_viscous_threaded[Threads.threadid()]
+    for i in eachdim(mesh)
+      apply_to_each_field(mul_by!(strong_differentiation_matrices[i]),
+                                  local_reference_gradients[i], view(u, :, e))
+    end
+
+    # rotate to physical frame on each element
     for i in eachdim(mesh), j in eachdim(mesh)
-      dxidxhatj = mesh.md.rstxyzJ[i, j][1, e] # TODO: DGMulti. Assumes mesh is affine here.
-      apply_to_each_field(mul_by_accum!(weak_differentiation_matrices[j], dxidxhatj),
-                          view(gradients[i], :, e), view(u, :, e))
+      for node in eachindex(local_reference_gradients[j])
+        gradients[i][node, e] = gradients[i][node, e] + dxidxhatj[i, j][node, e] * local_reference_gradients[j][node]
+      end
     end
   end
+end
 
-  @unpack u_face_values = cache_parabolic
-  prolong2interfaces!(u_face_values, u, mesh, equations, dg.surface_integral, dg, cache)
+function calc_gradient!(gradients, u::StructArray, t, mesh::DGMultiMesh,
+                        equations::AbstractEquationsParabolic,
+                        boundary_conditions, dg::DGMulti, cache, cache_parabolic)
+
+  for dim in eachindex(gradients)
+    reset_du!(gradients[dim], dg)
+  end
+
+  calc_gradient_volume_integral!(gradients, u, mesh, equations, dg, cache, cache_parabolic)
+
+  (; u_face_values) = cache_parabolic
+  apply_to_each_field(mul_by!(dg.basis.Vf), u_face_values, u)
 
   # compute fluxes at interfaces
-  @unpack scalar_flux_face_values = cache_parabolic
-  @unpack mapM, mapP, Jf = mesh.md
+  (; scalar_flux_face_values) = cache_parabolic
+  (; mapM, mapP) = mesh.md
   @threaded for face_node_index in each_face_node_global(mesh, dg)
     idM, idP = mapM[face_node_index], mapP[face_node_index]
     uM = u_face_values[idM]
     uP = u_face_values[idP]
-    scalar_flux_face_values[idM] = 0.5 * (uP + uM) # TODO: use strong/weak formulation for curved meshes?
+    # Here, we use the "strong" formulation to compute the gradient. This guarantees that the parabolic
+    # formulation is symmetric and stable on curved meshes with variable geometric terms.
+    scalar_flux_face_values[idM] = 0.5 * (uP - uM)
   end
 
   calc_boundary_flux!(scalar_flux_face_values, u_face_values, t, Gradient(), boundary_conditions,
                       mesh, equations, dg, cache, cache_parabolic)
 
   # compute surface contributions
-  calc_gradient_surface_integral(gradients, u, scalar_flux_face_values,
-                                 mesh, equations, dg, cache, cache_parabolic)
+  calc_gradient_surface_integral!(gradients, u, scalar_flux_face_values,
+                                  mesh, equations, dg, cache, cache_parabolic)
 
-  for dim in eachdim(mesh)
-    invert_jacobian!(gradients[dim], mesh, equations, dg, cache; scaling=1.0)
+  invert_jacobian_gradient!(gradients, mesh, equations, dg, cache, cache_parabolic)
+
+end
+
+# affine mesh - constant Jacobian version
+function invert_jacobian_gradient!(gradients, mesh::DGMultiMesh, equations, dg::DGMulti,
+                                   cache, cache_parabolic)
+  @threaded for e in eachelement(mesh, dg)
+
+    # Here, we exploit the fact that J is constant on affine elements,
+    # so we only have to access invJ once per element.
+    invJ = cache_parabolic.invJ[1, e]
+
+    for dim in eachdim(mesh)
+      for i in axes(gradients[dim], 1)
+        gradients[dim][i, e] = gradients[dim][i, e] * invJ
+      end
+    end
+  end
+end
+
+# non-affine mesh - variable Jacobian version
+function invert_jacobian_gradient!(gradients, mesh::DGMultiMesh{NDIMS, <:NonAffine}, equations,
+                                   dg::DGMulti, cache, cache_parabolic) where {NDIMS}
+  (; invJ) = cache_parabolic
+  @threaded for e in eachelement(mesh, dg)
+    for dim in eachdim(mesh)
+      for i in axes(gradients[dim], 1)
+        gradients[dim][i, e] = gradients[dim][i, e] * invJ[i, e]
+      end
+    end
   end
 end
 
@@ -139,7 +207,6 @@ end
 calc_boundary_flux!(flux, u, t, operator_type, boundary_conditions::NamedTuple{(),Tuple{}},
                     mesh, equations, dg::DGMulti, cache, cache_parabolic) = nothing
 
-# TODO: DGMulti. Decide if we want to use the input `u_face_values` (currently unused)
 function calc_single_boundary_flux!(flux_face_values, u_face_values, t,
                                     operator_type, boundary_condition, boundary_key,
                                     mesh, equations, dg::DGMulti{NDIMS}, cache, cache_parabolic) where {NDIMS}
@@ -147,7 +214,7 @@ function calc_single_boundary_flux!(flux_face_values, u_face_values, t,
   md = mesh.md
 
   num_pts_per_face = rd.Nfq ÷ rd.Nfaces
-  @unpack xyzf, nxyzJ, Jf = md
+  (; xyzf, nxyz) = md
   for f in mesh.boundary_faces[boundary_key]
     for i in Base.OneTo(num_pts_per_face)
 
@@ -155,7 +222,7 @@ function calc_single_boundary_flux!(flux_face_values, u_face_values, t,
       e = ((f-1) ÷ rd.Nfaces) + 1
       fid = i + ((f-1) % rd.Nfaces) * num_pts_per_face
 
-      face_normal = SVector{NDIMS}(getindex.(nxyzJ, fid, e)) / Jf[fid,e]
+      face_normal = SVector{NDIMS}(getindex.(nxyz, fid, e))
       face_coordinates = SVector{NDIMS}(getindex.(xyzf, fid, e))
 
       # for both the gradient and the divergence, the boundary flux is scalar valued.
@@ -163,6 +230,15 @@ function calc_single_boundary_flux!(flux_face_values, u_face_values, t,
       flux_face_values[fid,e] = boundary_condition(flux_face_values[fid,e], u_face_values[fid,e],
                                                    face_normal, face_coordinates, t,
                                                    operator_type, equations)
+
+      # Here, we use the "strong form" for the Gradient (and the "weak form" for Divergence).
+      # `flux_face_values` should contain the boundary values for `u`, and we
+      # subtract off `u_face_values[fid, e]` because we are using the strong formulation to
+      # compute the gradient.
+      if operator_type isa Gradient
+        flux_face_values[fid, e] = flux_face_values[fid, e] - u_face_values[fid, e]
+      end
+
     end
   end
   return nothing
@@ -176,38 +252,26 @@ function calc_viscous_fluxes!(flux_viscous, u, gradients, mesh::DGMultiMesh,
     reset_du!(flux_viscous[dim], dg)
   end
 
-  @unpack local_flux_viscous_threaded, local_u_values_threaded = cache_parabolic
+  (; local_u_values_threaded) = cache_parabolic
 
   @threaded for e in eachelement(mesh, dg)
 
-    # reset local storage for each element
-    local_flux_viscous = local_flux_viscous_threaded[Threads.threadid()]
+    # reset local storage for each element, interpolate u to quadrature points
+    # TODO: DGMulti. Specialize for nodal collocation methods (SBP, GaussSBP)?
     local_u_values = local_u_values_threaded[Threads.threadid()]
     fill!(local_u_values, zero(eltype(local_u_values)))
-    for dim in eachdim(mesh)
-      fill!(local_flux_viscous[dim], zero(eltype(local_flux_viscous[dim])))
-    end
-
-    # interpolate u and gradient to quadrature points, store in `local_flux_viscous`
-    apply_to_each_field(mul_by!(dg.basis.Vq), local_u_values, view(u, :, e)) # TODO: DGMulti. Specialize for nodal collocation methods (SBP, GaussSBP)
-    for dim in eachdim(mesh)
-      apply_to_each_field(mul_by!(dg.basis.Vq), local_flux_viscous[dim], view(gradients[dim], :, e))
-    end
+    apply_to_each_field(mul_by!(dg.basis.Vq), local_u_values, view(u, :, e))
 
     # compute viscous flux at quad points
     for i in eachindex(local_u_values)
       u_i = local_u_values[i]
-      gradients_i = getindex.(local_flux_viscous, i)
+      gradients_i = getindex.(gradients, i, e)
       for dim in eachdim(mesh)
         flux_viscous_i = flux(u_i, gradients_i, dim, equations)
-        setindex!(local_flux_viscous[dim], flux_viscous_i, i)
+        setindex!(flux_viscous[dim], flux_viscous_i, i, e)
       end
     end
 
-    # project back to the DG approximation space
-    for dim in eachdim(mesh)
-      apply_to_each_field(mul_by!(dg.basis.Pq), view(flux_viscous[dim], :, e), local_flux_viscous[dim])
-    end
   end
 end
 
@@ -222,55 +286,85 @@ function calc_viscous_penalty!(scalar_flux_face_values, u_face_values, t, bounda
                                mesh, equations::AbstractEquationsParabolic, dg::DGMulti,
                                parabolic_scheme, cache, cache_parabolic)
   # compute fluxes at interfaces
-  @unpack scalar_flux_face_values, inv_h = cache_parabolic
-  @unpack mapM, mapP = mesh.md
+  (; scalar_flux_face_values) = cache_parabolic
+  (; mapM, mapP) = mesh.md
   @threaded for face_node_index in each_face_node_global(mesh, dg)
     idM, idP = mapM[face_node_index], mapP[face_node_index]
     uM, uP = u_face_values[idM], u_face_values[idP]
-    inv_h_face = inv_h[face_node_index]
-    scalar_flux_face_values[idM] = scalar_flux_face_values[idM] + penalty(uP, uM, inv_h_face, equations, parabolic_scheme)
+    scalar_flux_face_values[idM] = scalar_flux_face_values[idM] + penalty(uP, uM, equations, parabolic_scheme)
   end
   return nothing
 end
 
-
-function calc_divergence!(du, u::StructArray, t, flux_viscous, mesh::DGMultiMesh,
-                          equations::AbstractEquationsParabolic,
-                          boundary_conditions, dg::DGMulti, parabolic_scheme, cache, cache_parabolic)
-
-  @unpack weak_differentiation_matrices = cache_parabolic
-
-  reset_du!(du, dg)
+function calc_divergence_volume_integral!(du, u, flux_viscous, mesh::DGMultiMesh,
+                                          equations::AbstractEquationsParabolic,
+                                          dg::DGMulti, cache, cache_parabolic)
+  (; weak_differentiation_matrices) = cache_parabolic
 
   # compute volume contributions to divergence
   @threaded for e in eachelement(mesh, dg)
     for i in eachdim(mesh), j in eachdim(mesh)
       dxidxhatj = mesh.md.rstxyzJ[i, j][1, e] # assumes mesh is affine
       apply_to_each_field(mul_by_accum!(weak_differentiation_matrices[j], dxidxhatj),
-                                view(du, :, e), view(flux_viscous[i], :, e))
+                          view(du, :, e), view(flux_viscous[i], :, e))
     end
   end
+end
+
+function calc_divergence_volume_integral!(du, u, flux_viscous, mesh::DGMultiMesh{NDIMS, <:NonAffine},
+                                          equations::AbstractEquationsParabolic,
+                                          dg::DGMulti, cache, cache_parabolic) where {NDIMS}
+  (; weak_differentiation_matrices, dxidxhatj, local_flux_viscous_threaded) = cache_parabolic
+
+  # compute volume contributions to divergence
+  @threaded for e in eachelement(mesh, dg)
+
+    local_viscous_flux = local_flux_viscous_threaded[Threads.threadid()][1]
+    for i in eachdim(mesh)
+      # rotate flux to reference coordinates
+      fill!(local_viscous_flux, zero(eltype(local_viscous_flux)))
+      for j in eachdim(mesh)
+        for node in eachindex(local_viscous_flux)
+          local_viscous_flux[node] = local_viscous_flux[node] + dxidxhatj[j, i][node, e] * flux_viscous[j][node, e]
+        end
+      end
+
+      # differentiate with respect to reference coordinates
+      apply_to_each_field(mul_by_accum!(weak_differentiation_matrices[i]),
+                          view(du, :, e), local_viscous_flux)
+    end
+  end
+end
+
+function calc_divergence!(du, u::StructArray, t, flux_viscous, mesh::DGMultiMesh,
+                          equations::AbstractEquationsParabolic,
+                          boundary_conditions, dg::DGMulti, parabolic_scheme, cache, cache_parabolic)
+
+  reset_du!(du, dg)
+
+  calc_divergence_volume_integral!(du, u, flux_viscous, mesh, equations, dg, cache, cache_parabolic)
 
   # interpolates from solution coefficients to face quadrature points
+  (; projection_face_interpolation_matrix) = cache_parabolic
   flux_viscous_face_values = cache_parabolic.gradients_face_values # reuse storage
   for dim in eachdim(mesh)
-    prolong2interfaces!(flux_viscous_face_values[dim], flux_viscous[dim], mesh, equations,
-                        dg.surface_integral, dg, cache)
+    apply_to_each_field(mul_by!(projection_face_interpolation_matrix), flux_viscous_face_values[dim], flux_viscous[dim])
   end
 
   # compute fluxes at interfaces
-  @unpack scalar_flux_face_values = cache_parabolic
-  @unpack mapM, mapP, nxyzJ = mesh.md
+  (; scalar_flux_face_values) = cache_parabolic
+  (; mapM, mapP, nxyzJ) = mesh.md
+
   @threaded for face_node_index in each_face_node_global(mesh, dg, cache, cache_parabolic)
     idM, idP = mapM[face_node_index], mapP[face_node_index]
 
     # compute f(u, ∇u) ⋅ n
     flux_face_value = zero(eltype(scalar_flux_face_values))
     for dim in eachdim(mesh)
-      uM = flux_viscous_face_values[dim][idM]
-      uP = flux_viscous_face_values[dim][idP]
-      # TODO: use strong/weak formulation to ensure stability on curved meshes?
-      flux_face_value = flux_face_value + 0.5 * (uP + uM) * nxyzJ[dim][face_node_index]
+      fM = flux_viscous_face_values[dim][idM]
+      fP = flux_viscous_face_values[dim][idP]
+      # Here, we use the "weak" formulation to compute the divergence (to ensure stability on curved meshes).
+      flux_face_value = flux_face_value + 0.5 * (fP + fM) * nxyzJ[dim][face_node_index]
     end
     scalar_flux_face_values[idM] = flux_face_value
   end
@@ -283,7 +377,7 @@ function calc_divergence!(du, u::StructArray, t, flux_viscous, mesh::DGMultiMesh
                         cache, cache_parabolic)
 
   # surface contributions
-  apply_to_each_field(mul_by_accum!(dg.basis.LIFT), du, scalar_flux_face_values)
+  apply_to_each_field(mul_by_accum!(cache_parabolic.divergence_lift_matrix), du, scalar_flux_face_values)
 
   # Note: we do not flip the sign of the geometric Jacobian here.
   # This is because the parabolic fluxes are assumed to be of the form
@@ -304,19 +398,26 @@ function rhs_parabolic!(du, u, t, mesh::DGMultiMesh, equations_parabolic::Abstra
 
   reset_du!(du, dg)
 
-  @unpack u_transformed, gradients, flux_viscous = cache_parabolic
-  transform_variables!(u_transformed, u, mesh, equations_parabolic,
-                       dg, parabolic_scheme, cache, cache_parabolic)
-
-  calc_gradient!(gradients, u_transformed, t, mesh, equations_parabolic,
-                 boundary_conditions, dg, cache, cache_parabolic)
+  @trixi_timeit timer() "transform variables" begin
+    (; u_transformed, gradients, flux_viscous) = cache_parabolic
+    transform_variables!(u_transformed, u, mesh, equations_parabolic,
+                        dg, parabolic_scheme, cache, cache_parabolic)
+  end
 
-  calc_viscous_fluxes!(flux_viscous, u_transformed, gradients,
-                       mesh, equations_parabolic, dg, cache, cache_parabolic)
+  @trixi_timeit timer() "calc gradient" begin
+    calc_gradient!(gradients, u_transformed, t, mesh, equations_parabolic,
+                  boundary_conditions, dg, cache, cache_parabolic)
+  end
 
-  calc_divergence!(du, u_transformed, t, flux_viscous, mesh, equations_parabolic,
-                   boundary_conditions, dg, parabolic_scheme, cache, cache_parabolic)
+  @trixi_timeit timer() "calc viscous fluxes" begin
+    calc_viscous_fluxes!(flux_viscous, u_transformed, gradients,
+                        mesh, equations_parabolic, dg, cache, cache_parabolic)
+  end
 
+  @trixi_timeit timer() "calc divergence" begin
+    calc_divergence!(du, u_transformed, t, flux_viscous, mesh, equations_parabolic,
+                    boundary_conditions, dg, parabolic_scheme, cache, cache_parabolic)
+  end
   return nothing
 
 end
diff --git a/test/test_parabolic_2d.jl b/test/test_parabolic_2d.jl
index ac4adbfd69b..588f43e4543 100644
--- a/test/test_parabolic_2d.jl
+++ b/test/test_parabolic_2d.jl
@@ -53,9 +53,9 @@ isdir(outdir) && rm(outdir, recursive=true)
     # pass in `boundary_condition_periodic` to skip boundary flux/integral evaluation
     Trixi.calc_gradient!(gradients, ode.u0, t, mesh, equations_parabolic,
                          boundary_condition_periodic, dg, cache, cache_parabolic)
-    @unpack x, y = mesh.md
-    @test getindex.(gradients[1], 1) ≈ 2 * x .* y
-    @test getindex.(gradients[2], 1) ≈ x.^2
+    @unpack x, y, xq, yq = mesh.md
+    @test getindex.(gradients[1], 1) ≈ 2 * xq .* yq
+    @test getindex.(gradients[2], 1) ≈ xq.^2
 
     u_flux = similar.(gradients)
     Trixi.calc_viscous_fluxes!(u_flux, ode.u0, gradients, mesh, equations_parabolic,
@@ -101,6 +101,14 @@ isdir(outdir) && rm(outdir, recursive=true)
     )
   end
 
+  @trixi_testset "DGMulti: elixir_navierstokes_convergence_curved.jl" begin
+    @test_trixi_include(joinpath(examples_dir(), "dgmulti_2d", "elixir_navierstokes_convergence_curved.jl"),
+      cells_per_dimension = (4, 4), tspan=(0.0, 0.1),
+      l2 = [0.004255101916146187, 0.011118488923215765, 0.011281831283462686, 0.03573656447388509],
+      linf = [0.015071710669706473, 0.04103132025858458, 0.03990424085750277, 0.1309401718598764],
+    )
+  end
+
   @trixi_testset "DGMulti: elixir_navierstokes_lid_driven_cavity.jl" begin
     @test_trixi_include(joinpath(examples_dir(), "dgmulti_2d", "elixir_navierstokes_lid_driven_cavity.jl"),
       cells_per_dimension = (4, 4), tspan=(0.0, 0.5),
diff --git a/test/test_parabolic_3d.jl b/test/test_parabolic_3d.jl
index 8799e5ccdae..1ae5eed44ae 100644
--- a/test/test_parabolic_3d.jl
+++ b/test/test_parabolic_3d.jl
@@ -19,6 +19,14 @@ isdir(outdir) && rm(outdir, recursive=true)
     )
   end
 
+  @trixi_testset "DGMulti: elixir_navierstokes_convergence_curved.jl" begin
+    @test_trixi_include(joinpath(examples_dir(), "dgmulti_3d", "elixir_navierstokes_convergence_curved.jl"),
+      cells_per_dimension = (4, 4, 4), tspan=(0.0, 0.1),
+      l2 = [0.0014027227251207474, 0.0021322235533273513, 0.0027873741447455194, 0.0024587473070627423, 0.00997836818019202],
+      linf = [0.006341750402837576, 0.010306014252246865, 0.01520740250924979, 0.010968264045485565, 0.047454389831591115]
+    )
+  end
+
   @trixi_testset "DGMulti: elixir_navierstokes_taylor_green_vortex.jl" begin
     @test_trixi_include(joinpath(examples_dir(), "dgmulti_3d", "elixir_navierstokes_taylor_green_vortex.jl"),
       cells_per_dimension = (4, 4, 4), tspan=(0.0, 0.25),