diff --git a/Project.toml b/Project.toml
index b1724f423..5033ab24a 100644
--- a/Project.toml
+++ b/Project.toml
@@ -27,6 +27,7 @@ UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed"
 ArrayInterface = "6.0.24, 7"
 DiffEqBase = "6"
 EnumX = "1"
+Enzyme = "0.11"
 FiniteDiff = "2"
 ForwardDiff = "0.10.3"
 LinearSolve = "2"
@@ -38,19 +39,23 @@ SimpleNonlinearSolve = "0.1"
 SparseDiffTools = "1, 2"
 StaticArraysCore = "1.4"
 UnPack = "1.0"
+Zygote = "0.6"
 julia = "1.6"
 
 [extras]
 BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
+Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9"
 ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 LinearSolve = "7ed4a6bd-45f5-4d41-b270-4a48e9bafcae"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 SafeTestsets = "1bc83da4-3b8d-516f-aca4-4fe02f6d838f"
+SparseDiffTools = "47a9eef4-7e08-11e9-0b38-333d64bd3804"
 StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
 Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
+Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
 
 [targets]
-test = ["BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra"]
+test = ["Enzyme", "BenchmarkTools", "SafeTestsets", "Pkg", "Test", "ForwardDiff", "StaticArrays", "Symbolics", "LinearSolve", "Random", "LinearAlgebra", "Zygote", "SparseDiffTools"]
diff --git a/src/NonlinearSolve.jl b/src/NonlinearSolve.jl
index b774b7953..9fd4bb31d 100644
--- a/src/NonlinearSolve.jl
+++ b/src/NonlinearSolve.jl
@@ -8,15 +8,16 @@ using DiffEqBase, LinearAlgebra, LinearSolve, SparseDiffTools
 import ForwardDiff
 
 import ADTypes: AbstractFiniteDifferencesMode
-import ArrayInterface: undefmatrix, matrix_colors
+import ArrayInterface: undefmatrix, matrix_colors, parameterless_type, ismutable
 import ConcreteStructs: @concrete
 import EnumX: @enumx
 import ForwardDiff: Dual
 import LinearSolve: ComposePreconditioner, InvPreconditioner, needs_concrete_A
-import RecursiveArrayTools: AbstractVectorOfArray, recursivecopy!, recursivefill!
+import RecursiveArrayTools: ArrayPartition,
+    AbstractVectorOfArray, recursivecopy!, recursivefill!
 import Reexport: @reexport
 import SciMLBase: AbstractNonlinearAlgorithm, NLStats, _unwrap_val, has_jac, isinplace
-import StaticArraysCore: StaticArray, SVector
+import StaticArraysCore: StaticArray, SVector, SArray, MArray
 import UnPack: @unpack
 
 @reexport using ADTypes, SciMLBase, SimpleNonlinearSolve
@@ -33,8 +34,6 @@ function SciMLBase.__solve(prob::NonlinearProblem, alg::AbstractNonlinearSolveAl
     return solve!(cache)
 end
 
-# FIXME: Scalar Case is Completely Broken
-
 include("utils.jl")
 include("raphson.jl")
 include("trustRegion.jl")
diff --git a/src/jacobian.jl b/src/jacobian.jl
index 2a96432d7..9c7f6e721 100644
--- a/src/jacobian.jl
+++ b/src/jacobian.jl
@@ -36,12 +36,16 @@ function jacobian!!(J::Union{AbstractMatrix{<:Number}, Nothing}, cache)
     @unpack f, uf, u, p, jac_cache, alg, fu2 = cache
     iip = isinplace(cache)
     if iip
-        has_jac(f) ? f.jac(J, u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, u)
+        has_jac(f) ? f.jac(J, u, p) :
+        sparse_jacobian!(J, alg.ad, jac_cache, uf, fu2, _maybe_mutable(u, alg.ad))
     else
-        return has_jac(f) ? f.jac(u, p) : sparse_jacobian!(J, alg.ad, jac_cache, uf, u)
+        return has_jac(f) ? f.jac(u, p) :
+               sparse_jacobian!(J, alg.ad, jac_cache, uf, _maybe_mutable(u, alg.ad))
     end
-    return nothing
+    return J
 end
+# Scalar case
+jacobian!!(::Number, cache) = last(value_derivative(cache.uf, cache.u))
 
 # Build Jacobian Caches
 function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
@@ -54,15 +58,16 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
     linsolve_needs_jac = (concrete_jac(alg) === nothing &&
                           (!haslinsolve || (haslinsolve && (alg.linsolve === nothing ||
                              needs_concrete_A(alg.linsolve)))))
-    alg_wants_jac = (concrete_jac(alg) === nothing && concrete_jac(alg))
+    alg_wants_jac = (concrete_jac(alg) !== nothing && concrete_jac(alg))
 
     # NOTE: The deepcopy is needed here since we are using the resid_prototype elsewhere
-    fu = f.resid_prototype === nothing ? (iip ? zero(u) : f(u, p)) :
-         deepcopy(f.resid_prototype)
+    fu = f.resid_prototype === nothing ? (iip ? _mutable_zero(u) : _mutable(f(u, p))) :
+         (iip ? deepcopy(f.resid_prototype) : f.resid_prototype)
     if !has_analytic_jac && (linsolve_needs_jac || alg_wants_jac)
         sd = sparsity_detection_alg(f, alg.ad)
-        jac_cache = iip ? sparse_jacobian_cache(alg.ad, sd, uf, fu, u) :
-                    sparse_jacobian_cache(alg.ad, sd, uf, u; fx = fu)
+        ad = alg.ad
+        jac_cache = iip ? sparse_jacobian_cache(ad, sd, uf, fu, _maybe_mutable(u, ad)) :
+                    sparse_jacobian_cache(ad, sd, uf, _maybe_mutable(u, ad); fx = fu)
     else
         jac_cache = nothing
     end
@@ -78,7 +83,7 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
         end
     end
 
-    du = zero(u)
+    du = _mutable_zero(u)
     linprob = LinearProblem(J, _vec(fu); u0 = _vec(du))
 
     weight = similar(u)
@@ -90,3 +95,11 @@ function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u, p,
 
     return uf, linsolve, J, fu, jac_cache, du
 end
+
+## Special Handling for Scalars
+function jacobian_caches(alg::AbstractNonlinearSolveAlgorithm, f, u::Number, p,
+    ::Val{false})
+    # NOTE: Scalar `u` assumes scalar output from `f`
+    uf = JacobianWrapper(f, p)
+    return uf, nothing, u, nothing, nothing, u
+end
diff --git a/src/levenberg.jl b/src/levenberg.jl
index 15956c7df..6265eba3f 100644
--- a/src/levenberg.jl
+++ b/src/levenberg.jl
@@ -1,335 +1,335 @@
-# """
-#     LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-#         precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
-#         damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
-#         finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
-#         b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
+"""
+    LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+        precs = DEFAULT_PRECS, damping_initial::Real = 1.0,
+        damping_increase_factor::Real = 2.0, damping_decrease_factor::Real = 3.0,
+        finite_diff_step_geodesic::Real = 0.1, α_geodesic::Real = 0.75,
+        b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8, adkwargs...)
 
-# An advanced Levenberg-Marquardt implementation with the improvements suggested in the
-# [paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
-# algorithm for nonlinear least-squares minimization". Designed for large-scale and
-# numerically-difficult nonlinear systems.
+An advanced Levenberg-Marquardt implementation with the improvements suggested in the
+[paper](https://arxiv.org/abs/1201.5885) "Improvements to the Levenberg-Marquardt
+algorithm for nonlinear least-squares minimization". Designed for large-scale and
+numerically-difficult nonlinear systems.
 
-# ### Keyword Arguments
+### Keyword Arguments
 
-#   - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
-#       ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
-#       `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
-#   - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
-#     then the Jacobian will not be constructed and instead direct Jacobian-vector products
-#     `J*v` are computed using forward-mode automatic differentiation or finite differencing
-#     tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
-#     for example for a preconditioner, `concrete_jac = true` can be passed in order to force
-#     the construction of the Jacobian.
-#   - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
-#     linear solves within the Newton method. Defaults to `nothing`, which means it uses the
-#     LinearSolve.jl default algorithm choice. For more information on available algorithm
-#     choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-#   - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
-#     preconditioners. For more information on specifying preconditioners for LinearSolve
-#     algorithms, consult the
-#     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
-#   - `damping_initial`: the starting value for the damping factor. The damping factor is
-#     inversely proportional to the step size. The damping factor is adjusted during each
-#     iteration. Defaults to `1.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `damping_increase_factor`: the factor by which the damping is increased if a step is
-#     rejected. Defaults to `2.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
-#     accepted. Defaults to `3.0`. For more details, see section 2.1 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
-#     the geodesic acceleration. Defaults to `0.1` which means that the step size is
-#     approximately 10% of the first-order step. For more details, see section 3 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
-#     geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
-#     that acceptable steps meet the condition
-#     ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
-#     acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
-#     path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
-#     is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
-#     effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
-#     choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
-#     and reject all steps that increase the cost. Although this is a natural and safe choice,
-#     it is often not the most efficient. Therefore downhill moves are always accepted, but
-#     uphill moves are only conditionally accepted. To decide whether an uphill move will be
-#     accepted at each iteration ``i``, we compute
-#     ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
-#     between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
-#     step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
-#     specify, uphill moves are accepted if
-#     ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
-#     iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
-#     allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
-#     will be accepted. Defaults to `1.0`. For more details, see section 4 of
-#     [this paper](https://arxiv.org/abs/1201.5885).
-#   - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
-#     `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
-#     where `J` is the Jacobian. It is suggested by
-#     [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
-#     `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
-# """
-# @concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
-#     ad::AD
-#     linsolve
-#     precs
-#     damping_initial::T
-#     damping_increase_factor::T
-#     damping_decrease_factor::T
-#     finite_diff_step_geodesic::T
-#     α_geodesic::T
-#     b_uphill::T
-#     min_damping_D::T
-# end
+  - `autodiff`: determines the backend used for the Jacobian. Note that this argument is
+      ignored if an analytical Jacobian is passed, as that will be used instead. Defaults to
+      `AutoForwardDiff()`. Valid choices are types from ADTypes.jl.
+  - `concrete_jac`: whether to build a concrete Jacobian. If a Krylov-subspace method is used,
+    then the Jacobian will not be constructed and instead direct Jacobian-vector products
+    `J*v` are computed using forward-mode automatic differentiation or finite differencing
+    tricks (without ever constructing the Jacobian). However, if the Jacobian is still needed,
+    for example for a preconditioner, `concrete_jac = true` can be passed in order to force
+    the construction of the Jacobian.
+  - `linsolve`: the [LinearSolve.jl](https://github.com/SciML/LinearSolve.jl) used for the
+    linear solves within the Newton method. Defaults to `nothing`, which means it uses the
+    LinearSolve.jl default algorithm choice. For more information on available algorithm
+    choices, see the [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `precs`: the choice of preconditioners for the linear solver. Defaults to using no
+    preconditioners. For more information on specifying preconditioners for LinearSolve
+    algorithms, consult the
+    [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
+  - `damping_initial`: the starting value for the damping factor. The damping factor is
+    inversely proportional to the step size. The damping factor is adjusted during each
+    iteration. Defaults to `1.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_increase_factor`: the factor by which the damping is increased if a step is
+    rejected. Defaults to `2.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `damping_decrease_factor`: the factor by which the damping is decreased if a step is
+    accepted. Defaults to `3.0`. For more details, see section 2.1 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `finite_diff_step_geodesic`: the step size used for finite differencing used to calculate
+    the geodesic acceleration. Defaults to `0.1` which means that the step size is
+    approximately 10% of the first-order step. For more details, see section 3 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `α_geodesic`: a factor that determines if a step is accepted or rejected. To incorporate
+    geodesic acceleration as an addition to the Levenberg-Marquardt algorithm, it is necessary
+    that acceptable steps meet the condition
+    ``\\frac{2||a||}{||v||} \\le \\alpha_{\\text{geodesic}}``, where ``a`` is the geodesic
+    acceleration, ``v`` is the Levenberg-Marquardt algorithm's step (velocity along a geodesic
+    path) and `α_geodesic` is some number of order `1`. For most problems `α_geodesic = 0.75`
+    is a good value but for problems where convergence is difficult `α_geodesic = 0.1` is an
+    effective choice. Defaults to `0.75`. For more details, see section 3, equation (15) of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `b_uphill`: a factor that determines if a step is accepted or rejected. The standard
+    choice in the Levenberg-Marquardt method is to accept all steps that decrease the cost
+    and reject all steps that increase the cost. Although this is a natural and safe choice,
+    it is often not the most efficient. Therefore downhill moves are always accepted, but
+    uphill moves are only conditionally accepted. To decide whether an uphill move will be
+    accepted at each iteration ``i``, we compute
+    ``\\beta_i = \\cos(v_{\\text{new}}, v_{\\text{old}})``, which denotes the cosine angle
+    between the proposed velocity ``v_{\\text{new}}`` and the velocity of the last accepted
+    step ``v_{\\text{old}}``. The idea is to accept uphill moves if the angle is small. To
+    specify, uphill moves are accepted if
+    ``(1-\\beta_i)^{b_{\\text{uphill}}} C_{i+1} \\le C_i``, where ``C_i`` is the cost at
+    iteration ``i``. Reasonable choices for `b_uphill` are `1.0` or `2.0`, with `b_uphill=2.0`
+    allowing higher uphill moves than `b_uphill=1.0`. When `b_uphill=0.0`, no uphill moves
+    will be accepted. Defaults to `1.0`. For more details, see section 4 of
+    [this paper](https://arxiv.org/abs/1201.5885).
+  - `min_damping_D`: the minimum value of the damping terms in the diagonal damping matrix
+    `DᵀD`, where `DᵀD` is given by the largest diagonal entries of `JᵀJ` yet encountered,
+    where `J` is the Jacobian. It is suggested by
+    [this paper](https://arxiv.org/abs/1201.5885) to use a minimum value of the elements in
+    `DᵀD` to prevent the damping from being too small. Defaults to `1e-8`.
+"""
+@concrete struct LevenbergMarquardt{CJ, AD, T} <: AbstractNewtonAlgorithm{CJ, AD}
+    ad::AD
+    linsolve
+    precs
+    damping_initial::T
+    damping_increase_factor::T
+    damping_decrease_factor::T
+    finite_diff_step_geodesic::T
+    α_geodesic::T
+    b_uphill::T
+    min_damping_D::T
+end
 
-# function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
-#     precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
-#     damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
-#     α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
-#     adkwargs...)
-#     ad = default_adargs_to_adtype(adkwargs...)
-#     return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
-#         damping_initial, damping_increase_factor, damping_decrease_factor,
-#         finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
-# end
+function LevenbergMarquardt(; concrete_jac = nothing, linsolve = nothing,
+    precs = DEFAULT_PRECS, damping_initial::Real = 1.0, damping_increase_factor::Real = 2.0,
+    damping_decrease_factor::Real = 3.0, finite_diff_step_geodesic::Real = 0.1,
+    α_geodesic::Real = 0.75, b_uphill::Real = 1.0, min_damping_D::AbstractFloat = 1e-8,
+    adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
+    return LevenbergMarquardt{_unwrap_val(concrete_jac)}(ad, linsolve, precs,
+        damping_initial, damping_increase_factor, damping_decrease_factor,
+        finite_diff_step_geodesic, α_geodesic, b_uphill, min_damping_D)
+end
 
-# @concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
-#     f
-#     alg
-#     u::uType
-#     fu1
-#     fu2
-#     du
-#     p
-#     uf
-#     linsolve
-#     J::jType
-#     jac_cache
-#     force_stop::Bool
-#     maxiters::Int
-#     internalnorm
-#     retcode::ReturnCode.T
-#     abstol
-#     prob
-#     DᵀD
-#     JᵀJ::jType
-#     λ::λType
-#     λ_factor::λType
-#     damping_increase_factor::λType
-#     damping_decrease_factor::λType
-#     h::λType
-#     α_geodesic::λType
-#     b_uphill::λType
-#     min_damping_D::λType
-#     v::uType
-#     a::uType
-#     tmp_vec::uType
-#     v_old::uType
-#     norm_v_old::lossType
-#     δ::uType
-#     loss_old::lossType
-#     make_new_J::Bool
-#     fu_tmp
-#     mat_tmp::jType
-#     stats::NLStats
-# end
+@concrete mutable struct LevenbergMarquardtCache{iip, uType, jType, λType, lossType}
+    f
+    alg
+    u::uType
+    fu1
+    fu2
+    du
+    p
+    uf
+    linsolve
+    J::jType
+    jac_cache
+    force_stop::Bool
+    maxiters::Int
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
+    DᵀD
+    JᵀJ::jType
+    λ::λType
+    λ_factor::λType
+    damping_increase_factor::λType
+    damping_decrease_factor::λType
+    h::λType
+    α_geodesic::λType
+    b_uphill::λType
+    min_damping_D::λType
+    v::uType
+    a::uType
+    tmp_vec::uType
+    v_old::uType
+    norm_v_old::lossType
+    δ::uType
+    loss_old::lossType
+    make_new_J::Bool
+    fu_tmp
+    mat_tmp::jType
+    stats::NLStats
+end
 
-# isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
+isinplace(::LevenbergMarquardtCache{iip}) where {iip} = iip
 
-# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
-#     args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
-#     kwargs...) where {uType, iip}
-#     @unpack f, u0, p = prob
-#     u = alias_u0 ? u0 : deepcopy(u0)
-#     if iip
-#         fu1 = zero(u)  # TODO: Use Prototype
-#         f(fu1, u, p)
-#     else
-#         fu1 = f(u, p)
-#     end
-#     uf, linsolve, J, fu2, jac_cache = jacobian_caches(alg, f, u, p, Val(iip))
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::LevenbergMarquardt,
+    args...; alias_u0 = false, maxiters = 1000, abstol = 1e-6, internalnorm = DEFAULT_NORM,
+    kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
+    if iip
+        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+        f(fu1, u, p)
+    else
+        fu1 = f(u, p)
+    end
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
 
-#     λ = convert(eltype(u), alg.damping_initial)
-#     λ_factor = convert(eltype(u), alg.damping_increase_factor)
-#     damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
-#     damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
-#     h = convert(eltype(u), alg.finite_diff_step_geodesic)
-#     α_geodesic = convert(eltype(u), alg.α_geodesic)
-#     b_uphill = convert(eltype(u), alg.b_uphill)
-#     min_damping_D = convert(eltype(u), alg.min_damping_D)
+    λ = convert(eltype(u), alg.damping_initial)
+    λ_factor = convert(eltype(u), alg.damping_increase_factor)
+    damping_increase_factor = convert(eltype(u), alg.damping_increase_factor)
+    damping_decrease_factor = convert(eltype(u), alg.damping_decrease_factor)
+    h = convert(eltype(u), alg.finite_diff_step_geodesic)
+    α_geodesic = convert(eltype(u), alg.α_geodesic)
+    b_uphill = convert(eltype(u), alg.b_uphill)
+    min_damping_D = convert(eltype(u), alg.min_damping_D)
 
-#     if u isa Number
-#         DᵀD = min_damping_D
-#     else
-#         d = similar(u)
-#         d .= min_damping_D
-#         DᵀD = Diagonal(d)
-#     end
+    if u isa Number
+        DᵀD = min_damping_D
+    else
+        d = similar(u)
+        d .= min_damping_D
+        DᵀD = Diagonal(d)
+    end
 
-#     loss = internalnorm(fu1)
-#     JᵀJ = zero(J)
-#     v = zero(u)
-#     a = zero(u)
-#     tmp_vec = zero(u)
-#     v_old = zero(u)
-#     δ = zero(u)
-#     make_new_J = true
-#     fu_tmp = zero(fu1)
-#     mat_tmp = zero(J)
+    loss = internalnorm(fu1)
+    JᵀJ = zero(J)
+    v = zero(u)
+    a = zero(u)
+    tmp_vec = zero(u)
+    v_old = zero(u)
+    δ = zero(u)
+    make_new_J = true
+    fu_tmp = zero(fu1)
+    mat_tmp = zero(J)
 
-#     return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, zero(u), p, uf, linsolve, J,
-#         jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
-#         JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
-#         b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
-#         mat_tmp, NLStats(1, 0, 0, 0, 0))
-# end
+    return LevenbergMarquardtCache{iip}(f, alg, u, fu1, fu2, du, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob, DᵀD,
+        JᵀJ, λ, λ_factor, damping_increase_factor, damping_decrease_factor, h, α_geodesic,
+        b_uphill, min_damping_D, v, a, tmp_vec, v_old, loss, δ, loss, make_new_J, fu_tmp,
+        mat_tmp, NLStats(1, 0, 0, 0, 0))
+end
 
-# function perform_step!(cache::LevenbergMarquardtCache{true})
-#     @unpack fu1, f, make_new_J = cache
-#     if iszero(fu1)
-#         cache.force_stop = true
-#         return nothing
-#     end
+function perform_step!(cache::LevenbergMarquardtCache{true})
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)
+        cache.force_stop = true
+        return nothing
+    end
 
-#     if make_new_J
-#         jacobian!!(cache.J, cache)
-#         mul!(cache.JᵀJ, cache.J', cache.J)
-#         cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-#         cache.make_new_J = false
-#         cache.stats.njacs += 1
-#     end
-#     @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
+    if make_new_J
+        jacobian!!(cache.J, cache)
+        mul!(cache.JᵀJ, cache.J', cache.J)
+        cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
+        cache.make_new_J = false
+        cache.stats.njacs += 1
+    end
+    @unpack u, p, λ, JᵀJ, DᵀD, J, alg, linsolve = cache
 
-#     # Usual Levenberg-Marquardt step ("velocity").
-#     # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
-#     mul!(cache.fu_tmp, J', fu1)
-#     @. cache.mat_tmp = JᵀJ + λ * DᵀD
-#     linres = dolinsolve(alg.precs, linsolve, A = cache.mat_tmp, b = _vec(cache.fu_tmp),
-#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     @. cache.v = -cache.du
+    # Usual Levenberg-Marquardt step ("velocity").
+    # The following lines do: cache.v = -cache.mat_tmp \ cache.fu_tmp
+    mul!(cache.fu_tmp, J', fu1)
+    @. cache.mat_tmp = JᵀJ + λ * DᵀD
+    linres = dolinsolve(alg.precs, linsolve; A = cache.mat_tmp, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    @. cache.v = -cache.du
 
-#     # Geodesic acceleration (step_size = v + a / 2).
-#     @unpack v, α_geodesic, h = cache
-#     f(cache.fu_tmp, u .+ h .* v, p)
+    # Geodesic acceleration (step_size = v + a / 2).
+    @unpack v, α_geodesic, h = cache
+    f(cache.fu_tmp, u .+ h .* v, p)
 
-#     # The following lines do: cache.a = -J \ cache.fu_tmp
-#     mul!(cache.du, J, v)
-#     @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
-#     linres = dolinsolve(alg.precs, linsolve, A = J, b = _vec(cache.fu_tmp),
-#         linu = _vec(cache.du), p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     @. cache.a = -cache.du
-#     cache.stats.nsolve += 2
-#     cache.stats.nfactors += 2
+    # The following lines do: cache.a = -J \ cache.fu_tmp
+    mul!(cache.du, J, v)
+    @. cache.fu_tmp = (2 / h) * ((cache.fu_tmp - fu1) / h - cache.du)
+    linres = dolinsolve(alg.precs, linsolve; A = J, b = _vec(cache.fu_tmp),
+        linu = _vec(cache.du), p = p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    @. cache.a = -cache.du
+    cache.stats.nsolve += 2
+    cache.stats.nfactors += 2
 
-#     # Require acceptable steps to satisfy the following condition.
-#     norm_v = norm(v)
-#     if (2 * norm(cache.a) / norm_v) < α_geodesic
-#         @. cache.δ = v + cache.a / 2
-#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-#         f(cache.fu_tmp, u .+ δ, p)
-#         cache.stats.nf += 1
-#         loss = cache.internalnorm(cache.fu_tmp)
+    # Require acceptable steps to satisfy the following condition.
+    norm_v = norm(v)
+    if (2 * norm(cache.a) / norm_v) < α_geodesic
+        @. cache.δ = v + cache.a / 2
+        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+        f(cache.fu_tmp, u .+ δ, p)
+        cache.stats.nf += 1
+        loss = cache.internalnorm(cache.fu_tmp)
 
-#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-#         β = dot(v, v_old) / (norm_v * norm_v_old)
-#         if (1 - β)^b_uphill * loss ≤ loss_old
-#             # Accept step.
-#             cache.u .+= δ
-#             if loss < cache.abstol
-#                 cache.force_stop = true
-#                 return nothing
-#             end
-#             cache.fu1 .= cache.fu_tmp
-#             cache.v_old .= v
-#             cache.norm_v_old = norm_v
-#             cache.loss_old = loss
-#             cache.λ_factor = 1 / cache.damping_decrease_factor
-#             cache.make_new_J = true
-#         end
-#     end
-#     cache.λ *= cache.λ_factor
-#     cache.λ_factor = cache.damping_increase_factor
-#     return nothing
-# end
+        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+        β = dot(v, v_old) / (norm_v * norm_v_old)
+        if (1 - β)^b_uphill * loss ≤ loss_old
+            # Accept step.
+            cache.u .+= δ
+            if loss < cache.abstol
+                cache.force_stop = true
+                return nothing
+            end
+            cache.fu1 .= cache.fu_tmp
+            cache.v_old .= v
+            cache.norm_v_old = norm_v
+            cache.loss_old = loss
+            cache.λ_factor = 1 / cache.damping_decrease_factor
+            cache.make_new_J = true
+        end
+    end
+    cache.λ *= cache.λ_factor
+    cache.λ_factor = cache.damping_increase_factor
+    return nothing
+end
 
-# function perform_step!(cache::LevenbergMarquardtCache{false})
-#     @unpack fu1, f, make_new_J = cache
-#     if iszero(fu1)
-#         cache.force_stop = true
-#         return nothing
-#     end
+function perform_step!(cache::LevenbergMarquardtCache{false})
+    @unpack fu1, f, make_new_J = cache
+    if _iszero(fu1)
+        cache.force_stop = true
+        return nothing
+    end
 
-#     if make_new_J
-#         cache.J = jacobian!!(cache.J, cache)
-#         cache.JᵀJ = cache.J' * cache.J
-#         if cache.JᵀJ isa Number
-#             cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
-#         else
-#             cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
-#         end
-#         cache.make_new_J = false
-#         cache.stats.njacs += 1
-#     end
-#     @unpack u, p, λ, JᵀJ, DᵀD, J = cache
+    if make_new_J
+        cache.J = jacobian!!(cache.J, cache)
+        cache.JᵀJ = cache.J' * cache.J
+        if cache.JᵀJ isa Number
+            cache.DᵀD = max(cache.DᵀD, cache.JᵀJ)
+        else
+            cache.DᵀD .= max.(cache.DᵀD, Diagonal(cache.JᵀJ))
+        end
+        cache.make_new_J = false
+        cache.stats.njacs += 1
+    end
+    @unpack u, p, λ, JᵀJ, DᵀD, J = cache
 
-#     # Usual Levenberg-Marquardt step ("velocity").
-#     cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
+    # Usual Levenberg-Marquardt step ("velocity").
+    cache.v = -(JᵀJ + λ * DᵀD) \ (J' * fu1)
 
-#     @unpack v, h, α_geodesic = cache
-#     # Geodesic acceleration (step_size = v + a / 2).
-#     cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
+    @unpack v, h, α_geodesic = cache
+    # Geodesic acceleration (step_size = v + a / 2).
+    cache.a = -J \ ((2 / h) .* ((f(u .+ h .* v, p) .- fu1) ./ h .- J * v))
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
 
-#     # Require acceptable steps to satisfy the following condition.
-#     norm_v = norm(v)
-#     if (2 * norm(cache.a) / norm_v) < α_geodesic
-#         cache.δ = v .+ cache.a ./ 2
-#         @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
-#         fu_new = f(u .+ δ, p)
-#         cache.stats.nf += 1
-#         loss = cache.internalnorm(fu_new)
+    # Require acceptable steps to satisfy the following condition.
+    norm_v = norm(v)
+    if (2 * norm(cache.a) / norm_v) < α_geodesic
+        cache.δ = v .+ cache.a ./ 2
+        @unpack δ, loss_old, norm_v_old, v_old, b_uphill = cache
+        fu_new = f(u .+ δ, p)
+        cache.stats.nf += 1
+        loss = cache.internalnorm(fu_new)
 
-#         # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
-#         β = dot(v, v_old) / (norm_v * norm_v_old)
-#         if (1 - β)^b_uphill * loss ≤ loss_old
-#             # Accept step.
-#             cache.u += δ
-#             if loss < cache.abstol
-#                 cache.force_stop = true
-#                 return nothing
-#             end
-#             cache.fu1 = fu_new
-#             cache.v_old = v
-#             cache.norm_v_old = norm_v
-#             cache.loss_old = loss
-#             cache.λ_factor = 1 / cache.damping_decrease_factor
-#             cache.make_new_J = true
-#         end
-#     end
-#     cache.λ *= cache.λ_factor
-#     cache.λ_factor = cache.damping_increase_factor
-#     return nothing
-# end
+        # Condition to accept uphill steps (evaluates to `loss ≤ loss_old` in iteration 1).
+        β = dot(v, v_old) / (norm_v * norm_v_old)
+        if (1 - β)^b_uphill * loss ≤ loss_old
+            # Accept step.
+            cache.u += δ
+            if loss < cache.abstol
+                cache.force_stop = true
+                return nothing
+            end
+            cache.fu1 = fu_new
+            cache.v_old = v
+            cache.norm_v_old = norm_v
+            cache.loss_old = loss
+            cache.λ_factor = 1 / cache.damping_decrease_factor
+            cache.make_new_J = true
+        end
+    end
+    cache.λ *= cache.λ_factor
+    cache.λ_factor = cache.damping_increase_factor
+    return nothing
+end
 
-# function SciMLBase.solve!(cache::LevenbergMarquardtCache)
-#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters
-#         perform_step!(cache)
-#         cache.stats.nsteps += 1
-#     end
+function SciMLBase.solve!(cache::LevenbergMarquardtCache)
+    while !cache.force_stop && cache.stats.nsteps < cache.maxiters
+        perform_step!(cache)
+        cache.stats.nsteps += 1
+    end
 
-#     if cache.stats.nsteps == cache.maxiters
-#         cache.retcode = ReturnCode.MaxIters
-#     else
-#         cache.retcode = ReturnCode.Success
-#     end
+    if cache.stats.nsteps == cache.maxiters
+        cache.retcode = ReturnCode.MaxIters
+    else
+        cache.retcode = ReturnCode.Success
+    end
 
-#     return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
-#         cache.retcode, cache.stats)
-# end
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu1;
+        cache.retcode, cache.stats)
+end
diff --git a/src/raphson.jl b/src/raphson.jl
index 9f7c1fb87..33d12c4ba 100644
--- a/src/raphson.jl
+++ b/src/raphson.jl
@@ -72,7 +72,7 @@ function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::NewtonRaphson
         fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
         f(fu1, u, p)
     else
-        fu1 = f.resid_prototype === nothing ? f(u, p) : f.resid_prototype
+        fu1 = _mutable(f(u, p))
     end
     uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
 
@@ -101,15 +101,19 @@ function perform_step!(cache::NewtonRaphsonCache{true})
 end
 
 function perform_step!(cache::NewtonRaphsonCache{false})
-    @unpack u, fu1, f, p, alg, linsolve, du = cache
+    @unpack u, fu1, f, p, alg, linsolve = cache
 
     cache.J = jacobian!!(cache.J, cache)
     # u = u - J \ fu
-    linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1), linu = _vec(du),
-        p, reltol = cache.abstol)
-    cache.linsolve = linres.cache
-    @. u = u - du
-    cache.fu1 = f(u, p)
+    if linsolve === nothing
+        cache.du = fu1 / cache.J
+    else
+        linres = dolinsolve(alg.precs, linsolve; A = cache.J, b = _vec(fu1),
+            linu = _vec(cache.du), p, reltol = cache.abstol)
+        cache.linsolve = linres.cache
+    end
+    cache.u = @. u - cache.du  # `u` might not support mutation
+    cache.fu1 = f(cache.u, p)
 
     cache.internalnorm(fu1) < cache.abstol && (cache.force_stop = true)
     cache.stats.nf += 1
diff --git a/src/trustRegion.jl b/src/trustRegion.jl
index c43b86699..41ccb994e 100644
--- a/src/trustRegion.jl
+++ b/src/trustRegion.jl
@@ -8,7 +8,7 @@ scheme are provided below.
 
 ## Using `RadiusUpdateSchemes`
 
-`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl), 
+`RadiusUpdateSchemes` uses the standard EnumX interface (https://github.com/fredrikekre/EnumX.jl),
 and hence inherits all properties of being an EnumX, including the type of each constituent enum
 states as `RadiusUpdateSchemes.T`. Simply put the desired scheme as follows:
 `TrustRegion(radius_update_scheme = your desired update scheme)`. For example,
@@ -99,7 +99,7 @@ for large-scale and numerically-difficult nonlinear systems.
     algorithms, consult the
     [LinearSolve.jl documentation](https://docs.sciml.ai/LinearSolve/stable/).
   - `radius_update_scheme`: the choice of radius update scheme to be used. Defaults to `RadiusUpdateSchemes.Simple`
-    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`, 
+    which follows the conventional approach. Other available schemes are `RadiusUpdateSchemes.Hei`,
     `RadiusUpdateSchemes.Yuan`, `RadiusUpdateSchemes.Bastin`, `RadiusUpdateSchemes.Fan`. These schemes
     have the trust region radius converging to zero that is seen to improve convergence. For more details, see the
     [Yuan, Yx](https://link.springer.com/article/10.1007/s10107-015-0893-2#Sec4).
@@ -149,471 +149,454 @@ function TrustRegion(; concrete_jac = nothing, linsolve = nothing, precs = DEFAU
     step_threshold::Real = 1 // 10, shrink_threshold::Real = 1 // 4,
     expand_threshold::Real = 3 // 4, shrink_factor::Real = 1 // 4,
     expand_factor::Real = 2 // 1, max_shrink_times::Int = 32, adkwargs...)
-    ad = default_adargs_to_adtype(adkwargs...)
+    ad = default_adargs_to_adtype(; adkwargs...)
     return TrustRegion{_unwrap_val(concrete_jac)}(ad, linsolve, precs, radius_update_scheme,
         max_trust_radius, initial_trust_radius, step_threshold, shrink_threshold,
         expand_threshold, shrink_factor, expand_factor, max_shrink_times)
 end
 
-# @concrete mutable struct TrustRegionCache{iip}
-#     f
-#     alg
-#     u_prev::uType
-#     u::uType
-#     fu_prev::resType
-#     fu::resType
-#     p
-#     uf
-#     linsolve
-#     J::jType
-#     jac_cache
-#     force_stop::Bool
-#     maxiters::Int
-#     internalnorm
-#     retcode::ReturnCode.T
-#     abstol
-#     prob
-#     radius_update_scheme::RadiusUpdateSchemes.T
-#     trust_r::trustType
-#     max_trust_r::trustType
-#     step_threshold
-#     shrink_threshold::trustType
-#     expand_threshold::trustType
-#     shrink_factor::trustType
-#     expand_factor::trustType
-#     loss::floatType
-#     loss_new::floatType
-#     H::jType
-#     g::resType
-#     shrink_counter::Int
-#     step_size
-#     u_tmp
-#     fu_new::resType
-#     make_new_J::Bool
-#     r::floatType
-#     p1::floatType
-#     p2::floatType
-#     p3::floatType
-#     p4::floatType
-#     ϵ::floatType
-#     stats::NLStats
-# end
-
-# function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion,
-#     args...;
-#     alias_u0 = false,
-#     maxiters = 1000,
-#     abstol = 1e-8,
-#     internalnorm = DEFAULT_NORM,
-#     kwargs...) where {uType, iip}
-#     if alias_u0
-#         u = prob.u0
-#     else
-#         u = deepcopy(prob.u0)
-#     end
-#     u_prev = zero(u)
-#     f = prob.f
-#     p = prob.p
-#     if iip
-#         fu = zero(u)
-#         f(fu, u, p)
-#     else
-#         fu = f(u, p)
-#     end
-#     fu_prev = zero(fu)
-
-#     loss = get_loss(fu)
-#     uf, linsolve, J, u_tmp, jac_config = jacobian_caches(alg, f, u, p, Val(iip))
-
-#     radius_update_scheme = alg.radius_update_scheme
-#     max_trust_radius = convert(eltype(u), alg.max_trust_radius)
-#     initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
-#     step_threshold = convert(eltype(u), alg.step_threshold)
-#     shrink_threshold = convert(eltype(u), alg.shrink_threshold)
-#     expand_threshold = convert(eltype(u), alg.expand_threshold)
-#     shrink_factor = convert(eltype(u), alg.shrink_factor)
-#     expand_factor = convert(eltype(u), alg.expand_factor)
-#     # Set default trust region radius if not specified
-#     if iszero(max_trust_radius)
-#         max_trust_radius = convert(eltype(u), max(norm(fu), maximum(u) - minimum(u)))
-#     end
-#     if iszero(initial_trust_radius)
-#         initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
-#     end
-
-#     loss_new = loss
-#     H = ArrayInterface.undefmatrix(u)
-#     g = zero(fu)
-#     shrink_counter = 0
-#     step_size = zero(u)
-#     fu_new = zero(fu)
-#     make_new_J = true
-#     r = loss
-
-#     # Parameters for the Schemes
-#     p1 = convert(eltype(u), 0.0)
-#     p2 = convert(eltype(u), 0.0)
-#     p3 = convert(eltype(u), 0.0)
-#     p4 = convert(eltype(u), 0.0)
-#     ϵ = convert(eltype(u), 1.0e-8)
-#     if radius_update_scheme === RadiusUpdateSchemes.Hei
-#         step_threshold = convert(eltype(u), 0.0)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.25)
-#         p1 = convert(eltype(u), 5.0) # M
-#         p2 = convert(eltype(u), 0.1) # β
-#         p3 = convert(eltype(u), 0.15) # γ1
-#         p4 = convert(eltype(u), 0.15) # γ2
-#         initial_trust_radius = convert(eltype(u), 1.0)
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-#         step_threshold = convert(eltype(u), 0.0001)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.25)
-#         p1 = convert(eltype(u), 2.0) # μ
-#         p2 = convert(eltype(u), 1 / 6) # c5
-#         p3 = convert(eltype(u), 6.0) # c6
-#         p4 = convert(eltype(u), 0.0)
-#         if iip
-#             auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-#         else
-#             if isa(u, Number)
-#                 g = ForwardDiff.derivative(x -> f(x, p), u)
-#             else
-#                 g = auto_jacvec(x -> f(x, p), u, fu)
-#             end
-#         end
-#         initial_trust_radius = convert(eltype(u), p1 * norm(g))
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-#         step_threshold = convert(eltype(u), 0.0001)
-#         shrink_threshold = convert(eltype(u), 0.25)
-#         expand_threshold = convert(eltype(u), 0.75)
-#         p1 = convert(eltype(u), 0.1) # μ
-#         p2 = convert(eltype(u), 1 / 4) # c5
-#         p3 = convert(eltype(u), 12) # c6
-#         p4 = convert(eltype(u), 1.0e18) # M
-#         initial_trust_radius = convert(eltype(u), p1 * (norm(fu)^0.99))
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-#         step_threshold = convert(eltype(u), 0.05)
-#         shrink_threshold = convert(eltype(u), 0.05)
-#         expand_threshold = convert(eltype(u), 0.9)
-#         p1 = convert(eltype(u), 2.5)  #alpha_1
-#         p2 = convert(eltype(u), 0.25) # alpha_2
-#         p3 = convert(eltype(u), 0) # not required
-#         p4 = convert(eltype(u), 0) # not required
-#         initial_trust_radius = convert(eltype(u), 1.0)
-#     end
-
-#     return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu, p, uf, linsolve, J,
-#         jac_config,
-#         false, maxiters, internalnorm,
-#         ReturnCode.Default, abstol, prob, radius_update_scheme,
-#         initial_trust_radius,
-#         max_trust_radius, step_threshold, shrink_threshold,
-#         expand_threshold, shrink_factor, expand_factor, loss,
-#         loss_new, H, g, shrink_counter, step_size, u_tmp, fu_new,
-#         make_new_J, r, p1, p2, p3, p4, ϵ, NLStats(1, 0, 0, 0, 0))
-# end
-
-# function perform_step!(cache::TrustRegionCache{true})
-#     @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
-#     if cache.make_new_J
-#         jacobian!(J, cache)
-#         mul!(cache.H, J, J)
-#         mul!(cache.g, J, fu)
-#         cache.stats.njacs += 1
-#     end
-
-#     linres = dolinsolve(alg.precs, linsolve, A = cache.H, b = _vec(cache.g),
-#         linu = _vec(u_tmp),
-#         p = p, reltol = cache.abstol)
-#     cache.linsolve = linres.cache
-#     cache.u_tmp .= -1 .* u_tmp
-#     dogleg!(cache)
-
-#     # Compute the potentially new u
-#     cache.u_tmp .= u .+ cache.step_size
-#     f(cache.fu_new, cache.u_tmp, p)
-#     trust_region_step!(cache)
-#     cache.stats.nf += 1
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
-#     return nothing
-# end
-
-# function perform_step!(cache::TrustRegionCache{false})
-#     @unpack make_new_J, fu, f, u, p = cache
-
-#     if make_new_J
-#         J = jacobian(cache, f)
-#         cache.H = J * J
-#         cache.g = J * fu
-#         cache.stats.njacs += 1
-#     end
-
-#     @unpack g, H = cache
-#     # Compute the Newton step.
-#     cache.u_tmp = -H \ g
-#     dogleg!(cache)
-
-#     # Compute the potentially new u
-#     cache.u_tmp = u .+ cache.step_size
-#     cache.fu_new = f(cache.u_tmp, p)
-#     trust_region_step!(cache)
-#     cache.stats.nf += 1
-#     cache.stats.nsolve += 1
-#     cache.stats.nfactors += 1
-#     return nothing
-# end
-
-# function retrospective_step!(cache::TrustRegionCache{true})
-#     @unpack J, fu_prev, fu, u_prev, u = cache
-#     jacobian!(J, cache)
-#     mul!(cache.H, J, J)
-#     mul!(cache.g, J, fu)
-#     cache.stats.njacs += 1
-#     @unpack H, g, step_size = cache
-
-#     return -(get_loss(fu_prev) - get_loss(fu)) /
-#            (step_size' * g + step_size' * H * step_size / 2)
-# end
-
-# function retrospective_step!(cache::TrustRegionCache{false})
-#     @unpack J, fu_prev, fu, u_prev, u, f = cache
-#     J = jacobian(cache, f)
-#     cache.H = J * J
-#     cache.g = J * fu
-#     cache.stats.njacs += 1
-#     @unpack H, g, step_size = cache
-
-#     return -(get_loss(fu_prev) - get_loss(fu)) /
-#            (step_size' * g + step_size' * H * step_size / 2)
-# end
-
-# function trust_region_step!(cache::TrustRegionCache)
-#     @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
-#     cache.loss_new = get_loss(fu_new)
-
-#     # Compute the ratio of the actual reduction to the predicted reduction.
-#     cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
-#     @unpack r = cache
-
-#     if radius_update_scheme === RadiusUpdateSchemes.Simple
-#         # Update the trust region radius.
-#         if r < cache.shrink_threshold
-#             cache.trust_r *= cache.shrink_factor
-#             cache.shrink_counter += 1
-#         else
-#             cache.shrink_counter = 0
-#         end
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-
-#             # Update the trust region radius.
-#             if r > cache.expand_threshold
-#                 cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
-#             end
-
-#             cache.make_new_J = true
-#         else
-#             # No need to make a new J, no step was taken, so we try again with a smaller trust_r
-#             cache.make_new_J = false
-#         end
-
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-#             cache.force_stop = true
-#         end
-
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Hei
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-#         # Hei's radius update scheme
-#         @unpack shrink_threshold, p1, p2, p3, p4 = cache
-#         if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
-#            cache.trust_r
-#             cache.shrink_counter += 1
-#         else
-#             cache.shrink_counter = 0
-#         end
-#         cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
-#                         cache.internalnorm(step_size)
-
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
-#         if r < cache.shrink_threshold
-#             cache.p1 = cache.p2 * cache.p1
-#             cache.shrink_counter += 1
-#         elseif r >= cache.expand_threshold &&
-#                cache.internalnorm(step_size) > cache.trust_r / 2
-#             cache.p1 = cache.p3 * cache.p1
-#             cache.shrink_counter = 0
-#         end
-
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-
-#         @unpack p1 = cache
-#         cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-#         #Fan's update scheme
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Fan
-#         if r < cache.shrink_threshold
-#             cache.p1 *= cache.p2
-#             cache.shrink_counter += 1
-#         elseif r > cache.expand_threshold
-#             cache.p1 = min(cache.p1 * cache.p3, cache.p4)
-#             cache.shrink_counter = 0
-#         end
-
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#         else
-#             cache.make_new_J = false
-#         end
-
-#         @unpack p1 = cache
-#         cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
-#            cache.internalnorm(g) < cache.ϵ
-#             cache.force_stop = true
-#         end
-#     elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
-#         if r > cache.step_threshold
-#             take_step!(cache)
-#             cache.loss = cache.loss_new
-#             cache.make_new_J = true
-#             if retrospective_step!(cache) >= cache.expand_threshold
-#                 cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
-#             end
-
-#         else
-#             cache.make_new_J = false
-#             cache.trust_r *= cache.p2
-#             cache.shrink_counter += 1
-#         end
-#         if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
-#             cache.force_stop = true
-#         end
-#     end
-# end
-
-# function dogleg!(cache::TrustRegionCache)
-#     @unpack u_tmp, trust_r = cache
-
-#     # Test if the full step is within the trust region.
-#     if norm(u_tmp) ≤ trust_r
-#         cache.step_size = deepcopy(u_tmp)
-#         return
-#     end
-
-#     # Calcualte Cauchy point, optimum along the steepest descent direction.
-#     δsd = -cache.g
-#     norm_δsd = norm(δsd)
-#     if norm_δsd ≥ trust_r
-#         cache.step_size = δsd .* trust_r / norm_δsd
-#         return
-#     end
-
-#     # Find the intersection point on the boundary.
-#     N_sd = u_tmp - δsd
-#     dot_N_sd = dot(N_sd, N_sd)
-#     dot_sd_N_sd = dot(δsd, N_sd)
-#     dot_sd = dot(δsd, δsd)
-#     fact = dot_sd_N_sd^2 - dot_N_sd * (dot_sd - trust_r^2)
-#     τ = (-dot_sd_N_sd + sqrt(fact)) / dot_N_sd
-#     cache.step_size = δsd + τ * N_sd
-# end
-
-# function take_step!(cache::TrustRegionCache{true})
-#     cache.u_prev .= cache.u
-#     cache.u .= cache.u_tmp
-#     cache.fu_prev .= cache.fu
-#     cache.fu .= cache.fu_new
-# end
-
-# function take_step!(cache::TrustRegionCache{false})
-#     cache.u_prev = cache.u
-#     cache.u = cache.u_tmp
-#     cache.fu_prev = cache.fu
-#     cache.fu = cache.fu_new
-# end
-
-# function jvp!(cache::TrustRegionCache{false})
-#     @unpack f, u, fu, p = cache
-#     if isa(u, Number)
-#         return value_derivative(x -> f(x, p), u)
-#     end
-#     return auto_jacvec(x -> f(x, p), u, fu)
-# end
-
-# function jvp!(cache::TrustRegionCache{true})
-#     @unpack g, f, u, fu, p = cache
-#     if isa(u, Number)
-#         return value_derivative(x -> f(x, p), u)
-#     end
-#     auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu)
-#     g
-# end
-
-# function SciMLBase.solve!(cache::TrustRegionCache)
-#     while !cache.force_stop && cache.stats.nsteps < cache.maxiters &&
-#               cache.shrink_counter < cache.alg.max_shrink_times
-#         perform_step!(cache)
-#         cache.stats.nsteps += 1
-#     end
-
-#     if cache.stats.nsteps == cache.maxiters
-#         cache.retcode = ReturnCode.MaxIters
-#     else
-#         cache.retcode = ReturnCode.Success
-#     end
-
-#     SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu;
-#         retcode = cache.retcode, stats = cache.stats)
-# end
-
-# function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
-#     abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
-#     cache.p = p
-#     if iip
-#         recursivecopy!(cache.u, u0)
-#         cache.f(cache.fu, cache.u, p)
-#     else
-#         # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
-#         cache.u = u0
-#         cache.fu = cache.f(cache.u, p)
-#     end
-#     cache.abstol = abstol
-#     cache.maxiters = maxiters
-#     cache.stats.nf = 1
-#     cache.stats.nsteps = 1
-#     cache.force_stop = false
-#     cache.retcode = ReturnCode.Default
-#     cache.make_new_J = true
-#     cache.loss = get_loss(cache.fu)
-#     cache.shrink_counter = 0
-#     cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
-#     if iszero(cache.trust_r)
-#         cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
-#     end
-#     return cache
-# end
+@concrete mutable struct TrustRegionCache{iip, trustType, floatType}
+    f
+    alg
+    u_prev
+    u
+    fu_prev
+    fu
+    fu2
+    p
+    uf
+    linsolve
+    J
+    jac_cache
+    force_stop::Bool
+    maxiters::Int
+    internalnorm
+    retcode::ReturnCode.T
+    abstol
+    prob
+    radius_update_scheme::RadiusUpdateSchemes.T
+    trust_r::trustType
+    max_trust_r::trustType
+    step_threshold
+    shrink_threshold::trustType
+    expand_threshold::trustType
+    shrink_factor::trustType
+    expand_factor::trustType
+    loss::floatType
+    loss_new::floatType
+    H
+    g
+    shrink_counter::Int
+    step_size
+    u_tmp
+    fu_new
+    make_new_J::Bool
+    r::floatType
+    p1::floatType
+    p2::floatType
+    p3::floatType
+    p4::floatType
+    ϵ::floatType
+    stats::NLStats
+end
+
+function SciMLBase.__init(prob::NonlinearProblem{uType, iip}, alg::TrustRegion, args...;
+    alias_u0 = false, maxiters = 1000, abstol = 1e-8, internalnorm = DEFAULT_NORM,
+    kwargs...) where {uType, iip}
+    @unpack f, u0, p = prob
+    u = alias_u0 ? u0 : deepcopy(u0)
+    u_prev = zero(u)
+    if iip
+        fu1 = f.resid_prototype === nothing ? zero(u) : f.resid_prototype
+        f(fu1, u, p)
+    else
+        fu1 = f(u, p)
+    end
+    fu_prev = zero(fu1)
+
+    loss = get_loss(fu1)
+    uf, linsolve, J, fu2, jac_cache, du = jacobian_caches(alg, f, u, p, Val(iip))
+
+    radius_update_scheme = alg.radius_update_scheme
+    max_trust_radius = convert(eltype(u), alg.max_trust_radius)
+    initial_trust_radius = convert(eltype(u), alg.initial_trust_radius)
+    step_threshold = convert(eltype(u), alg.step_threshold)
+    shrink_threshold = convert(eltype(u), alg.shrink_threshold)
+    expand_threshold = convert(eltype(u), alg.expand_threshold)
+    shrink_factor = convert(eltype(u), alg.shrink_factor)
+    expand_factor = convert(eltype(u), alg.expand_factor)
+    # Set default trust region radius if not specified
+    if iszero(max_trust_radius)
+        max_trust_radius = convert(eltype(u), max(norm(fu1), maximum(u) - minimum(u)))
+    end
+    if iszero(initial_trust_radius)
+        initial_trust_radius = convert(eltype(u), max_trust_radius / 11)
+    end
+
+    loss_new = loss
+    H = zero(J)
+    g = _mutable_zero(fu1)
+    shrink_counter = 0
+    step_size = zero(u)
+    fu_new = zero(fu1)
+    make_new_J = true
+    r = loss
+
+    # Parameters for the Schemes
+    p1 = convert(eltype(u), 0.0)
+    p2 = convert(eltype(u), 0.0)
+    p3 = convert(eltype(u), 0.0)
+    p4 = convert(eltype(u), 0.0)
+    ϵ = convert(eltype(u), 1.0e-8)
+    if radius_update_scheme === RadiusUpdateSchemes.Hei
+        step_threshold = convert(eltype(u), 0.0)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.25)
+        p1 = convert(eltype(u), 5.0) # M
+        p2 = convert(eltype(u), 0.1) # β
+        p3 = convert(eltype(u), 0.15) # γ1
+        p4 = convert(eltype(u), 0.15) # γ2
+        initial_trust_radius = convert(eltype(u), 1.0)
+    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+        step_threshold = convert(eltype(u), 0.0001)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.25)
+        p1 = convert(eltype(u), 2.0) # μ
+        p2 = convert(eltype(u), 1 / 6) # c5
+        p3 = convert(eltype(u), 6.0) # c6
+        p4 = convert(eltype(u), 0.0)
+        if iip
+            auto_jacvec!(g, (fu, x) -> f(fu, x, p), u, fu1)
+        else
+            if isa(u, Number)
+                g = ForwardDiff.derivative(x -> f(x, p), u)
+            else
+                g = auto_jacvec(x -> f(x, p), u, fu1)
+            end
+        end
+        initial_trust_radius = convert(eltype(u), p1 * norm(g))
+    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+        step_threshold = convert(eltype(u), 0.0001)
+        shrink_threshold = convert(eltype(u), 0.25)
+        expand_threshold = convert(eltype(u), 0.75)
+        p1 = convert(eltype(u), 0.1) # μ
+        p2 = convert(eltype(u), 1 / 4) # c5
+        p3 = convert(eltype(u), 12) # c6
+        p4 = convert(eltype(u), 1.0e18) # M
+        initial_trust_radius = convert(eltype(u), p1 * (norm(fu1)^0.99))
+    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+        step_threshold = convert(eltype(u), 0.05)
+        shrink_threshold = convert(eltype(u), 0.05)
+        expand_threshold = convert(eltype(u), 0.9)
+        p1 = convert(eltype(u), 2.5)  #alpha_1
+        p2 = convert(eltype(u), 0.25) # alpha_2
+        p3 = convert(eltype(u), 0) # not required
+        p4 = convert(eltype(u), 0) # not required
+        initial_trust_radius = convert(eltype(u), 1.0)
+    end
+
+    return TrustRegionCache{iip}(f, alg, u_prev, u, fu_prev, fu1, fu2, p, uf, linsolve, J,
+        jac_cache, false, maxiters, internalnorm, ReturnCode.Default, abstol, prob,
+        radius_update_scheme, initial_trust_radius, max_trust_radius, step_threshold,
+        shrink_threshold, expand_threshold, shrink_factor, expand_factor, loss, loss_new,
+        H, g, shrink_counter, step_size, du, fu_new, make_new_J, r, p1, p2, p3, p4, ϵ,
+        NLStats(1, 0, 0, 0, 0))
+end
+
+isinplace(::TrustRegionCache{iip}) where {iip} = iip
+
+function perform_step!(cache::TrustRegionCache{true})
+    @unpack make_new_J, J, fu, f, u, p, u_tmp, alg, linsolve = cache
+    if cache.make_new_J
+        jacobian!!(J, cache)
+        mul!(cache.H, J, J)
+        mul!(cache.g, J, fu)
+        cache.stats.njacs += 1
+    end
+
+    linres = dolinsolve(alg.precs, linsolve; A = cache.H, b = _vec(cache.g),
+        linu = _vec(u_tmp), p, reltol = cache.abstol)
+    cache.linsolve = linres.cache
+    cache.u_tmp .= -1 .* u_tmp
+    dogleg!(cache)
+
+    # Compute the potentially new u
+    cache.u_tmp .= u .+ cache.step_size
+    f(cache.fu_new, cache.u_tmp, p)
+    trust_region_step!(cache)
+    cache.stats.nf += 1
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
+    return nothing
+end
+
+function perform_step!(cache::TrustRegionCache{false})
+    @unpack make_new_J, fu, f, u, p = cache
+
+    if make_new_J
+        J = jacobian!!(cache.J, cache)
+        cache.H = J * J
+        cache.g = J * fu
+        cache.stats.njacs += 1
+    end
+
+    @unpack g, H = cache
+    # Compute the Newton step.
+    cache.u_tmp = -H \ g
+    dogleg!(cache)
+
+    # Compute the potentially new u
+    cache.u_tmp = u .+ cache.step_size
+    cache.fu_new = f(cache.u_tmp, p)
+    trust_region_step!(cache)
+    cache.stats.nf += 1
+    cache.stats.nsolve += 1
+    cache.stats.nfactors += 1
+    return nothing
+end
+
+function retrospective_step!(cache::TrustRegionCache)
+    @unpack J, fu_prev, fu, u_prev, u = cache
+    J = jacobian!!(deepcopy(J), cache)
+    if J isa Number
+        cache.H = J * J
+        cache.g = J * fu
+    else
+        mul!(cache.H, J, J)
+        mul!(cache.g, J, fu)
+    end
+    cache.stats.njacs += 1
+    @unpack H, g, step_size = cache
+
+    return -(get_loss(fu_prev) - get_loss(fu)) /
+           (step_size' * g + step_size' * H * step_size / 2)
+end
+
+function trust_region_step!(cache::TrustRegionCache)
+    @unpack fu_new, step_size, g, H, loss, max_trust_r, radius_update_scheme = cache
+    cache.loss_new = get_loss(fu_new)
+
+    # Compute the ratio of the actual reduction to the predicted reduction.
+    cache.r = -(loss - cache.loss_new) / (step_size' * g + step_size' * H * step_size / 2)
+    @unpack r = cache
+
+    if radius_update_scheme === RadiusUpdateSchemes.Simple
+        # Update the trust region radius.
+        if r < cache.shrink_threshold
+            cache.trust_r *= cache.shrink_factor
+            cache.shrink_counter += 1
+        else
+            cache.shrink_counter = 0
+        end
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+
+            # Update the trust region radius.
+            if r > cache.expand_threshold
+                cache.trust_r = min(cache.expand_factor * cache.trust_r, max_trust_r)
+            end
+
+            cache.make_new_J = true
+        else
+            # No need to make a new J, no step was taken, so we try again with a smaller trust_r
+            cache.make_new_J = false
+        end
+
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+            cache.force_stop = true
+        end
+
+    elseif radius_update_scheme === RadiusUpdateSchemes.Hei
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+        # Hei's radius update scheme
+        @unpack shrink_threshold, p1, p2, p3, p4 = cache
+        if rfunc(r, shrink_threshold, p1, p3, p4, p2) * cache.internalnorm(step_size) <
+           cache.trust_r
+            cache.shrink_counter += 1
+        else
+            cache.shrink_counter = 0
+        end
+        cache.trust_r = rfunc(r, shrink_threshold, p1, p3, p4, p2) *
+                        cache.internalnorm(step_size)
+
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+
+    elseif radius_update_scheme === RadiusUpdateSchemes.Yuan
+        if r < cache.shrink_threshold
+            cache.p1 = cache.p2 * cache.p1
+            cache.shrink_counter += 1
+        elseif r >= cache.expand_threshold &&
+               cache.internalnorm(step_size) > cache.trust_r / 2
+            cache.p1 = cache.p3 * cache.p1
+            cache.shrink_counter = 0
+        end
+
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+
+        @unpack p1 = cache
+        cache.trust_r = p1 * cache.internalnorm(jvp!(cache))
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+        #Fan's update scheme
+    elseif radius_update_scheme === RadiusUpdateSchemes.Fan
+        if r < cache.shrink_threshold
+            cache.p1 *= cache.p2
+            cache.shrink_counter += 1
+        elseif r > cache.expand_threshold
+            cache.p1 = min(cache.p1 * cache.p3, cache.p4)
+            cache.shrink_counter = 0
+        end
+
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+        else
+            cache.make_new_J = false
+        end
+
+        @unpack p1 = cache
+        cache.trust_r = p1 * (cache.internalnorm(cache.fu)^0.99)
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol ||
+           cache.internalnorm(g) < cache.ϵ
+            cache.force_stop = true
+        end
+    elseif radius_update_scheme === RadiusUpdateSchemes.Bastin
+        if r > cache.step_threshold
+            take_step!(cache)
+            cache.loss = cache.loss_new
+            cache.make_new_J = true
+            if retrospective_step!(cache) >= cache.expand_threshold
+                cache.trust_r = max(cache.p1 * cache.internalnorm(step_size), cache.trust_r)
+            end
+
+        else
+            cache.make_new_J = false
+            cache.trust_r *= cache.p2
+            cache.shrink_counter += 1
+        end
+        if iszero(cache.fu) || cache.internalnorm(cache.fu) < cache.abstol
+            cache.force_stop = true
+        end
+    end
+end
+
+function dogleg!(cache::TrustRegionCache)
+    @unpack u_tmp, trust_r = cache
+
+    # Test if the full step is within the trust region.
+    if norm(u_tmp) ≤ trust_r
+        cache.step_size = deepcopy(u_tmp)
+        return
+    end
+
+    # Calcualte Cauchy point, optimum along the steepest descent direction.
+    δsd = -cache.g
+    norm_δsd = norm(δsd)
+    if norm_δsd ≥ trust_r
+        cache.step_size = δsd .* trust_r / norm_δsd
+        return
+    end
+
+    # Find the intersection point on the boundary.
+    N_sd = u_tmp - δsd
+    dot_N_sd = dot(N_sd, N_sd)
+    dot_sd_N_sd = dot(δsd, N_sd)
+    dot_sd = dot(δsd, δsd)
+    fact = dot_sd_N_sd^2 - dot_N_sd * (dot_sd - trust_r^2)
+    τ = (-dot_sd_N_sd + sqrt(fact)) / dot_N_sd
+    cache.step_size = δsd + τ * N_sd
+end
+
+function take_step!(cache::TrustRegionCache{true})
+    cache.u_prev .= cache.u
+    cache.u .= cache.u_tmp
+    cache.fu_prev .= cache.fu
+    cache.fu .= cache.fu_new
+end
+
+function take_step!(cache::TrustRegionCache{false})
+    cache.u_prev = cache.u
+    cache.u = cache.u_tmp
+    cache.fu_prev = cache.fu
+    cache.fu = cache.fu_new
+end
+
+function jvp!(cache::TrustRegionCache{false})
+    @unpack f, u, fu, uf = cache
+    if isa(u, Number)
+        return value_derivative(uf, u)
+    end
+    return auto_jacvec(uf, u, fu)
+end
+
+function jvp!(cache::TrustRegionCache{true})
+    @unpack g, f, u, fu, uf = cache
+    if isa(u, Number)
+        return value_derivative(uf, u)
+    end
+    auto_jacvec!(g, uf, u, fu)
+    return g
+end
+
+function SciMLBase.solve!(cache::TrustRegionCache)
+    while !cache.force_stop && cache.stats.nsteps < cache.maxiters &&
+              cache.shrink_counter < cache.alg.max_shrink_times
+        perform_step!(cache)
+        cache.stats.nsteps += 1
+    end
+
+    if cache.stats.nsteps == cache.maxiters
+        cache.retcode = ReturnCode.MaxIters
+    else
+        cache.retcode = ReturnCode.Success
+    end
+
+    return SciMLBase.build_solution(cache.prob, cache.alg, cache.u, cache.fu; cache.retcode,
+        cache.stats)
+end
+
+function SciMLBase.reinit!(cache::TrustRegionCache{iip}, u0 = cache.u; p = cache.p,
+    abstol = cache.abstol, maxiters = cache.maxiters) where {iip}
+    cache.p = p
+    if iip
+        recursivecopy!(cache.u, u0)
+        cache.f(cache.fu, cache.u, p)
+    else
+        # don't have alias_u0 but cache.u is never mutated for OOP problems so it doesn't matter
+        cache.u = u0
+        cache.fu = cache.f(cache.u, p)
+    end
+    cache.abstol = abstol
+    cache.maxiters = maxiters
+    cache.stats.nf = 1
+    cache.stats.nsteps = 1
+    cache.force_stop = false
+    cache.retcode = ReturnCode.Default
+    cache.make_new_J = true
+    cache.loss = get_loss(cache.fu)
+    cache.shrink_counter = 0
+    cache.trust_r = convert(eltype(cache.u), cache.alg.initial_trust_radius)
+    if iszero(cache.trust_r)
+        cache.trust_r = convert(eltype(cache.u), cache.max_trust_r / 11)
+    end
+    return cache
+end
diff --git a/src/utils.jl b/src/utils.jl
index c50d52ad7..3df540632 100644
--- a/src/utils.jl
+++ b/src/utils.jl
@@ -37,21 +37,21 @@ function default_adargs_to_adtype(; chunk_size = Val{0}(), autodiff = Val{true}(
     return ad
 end
 
-# """
-# value_derivative(f, x)
-
-# Compute `f(x), d/dx f(x)` in the most efficient way.
-# """
-# function value_derivative(f::F, x::R) where {F, R}
-#     T = typeof(ForwardDiff.Tag(f, R))
-#     out = f(ForwardDiff.Dual{T}(x, one(x)))
-#     ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
-# end
-
-# # Todo: improve this dispatch
-# function value_derivative(f::F, x::StaticArraysCore.SVector) where {F}
-#     f(x), ForwardDiff.jacobian(f, x)
-# end
+"""
+value_derivative(f, x)
+
+Compute `f(x), d/dx f(x)` in the most efficient way.
+"""
+function value_derivative(f::F, x::R) where {F, R}
+    T = typeof(ForwardDiff.Tag(f, R))
+    out = f(ForwardDiff.Dual{T}(x, one(x)))
+    ForwardDiff.value(out), ForwardDiff.extract_derivative(T, out)
+end
+
+# Todo: improve this dispatch
+function value_derivative(f::F, x::SVector) where {F}
+    f(x), ForwardDiff.jacobian(f, x)
+end
 
 @inline value(x) = x
 @inline value(x::Dual) = ForwardDiff.value(x)
@@ -128,3 +128,17 @@ end
 
 concrete_jac(_) = nothing
 concrete_jac(::AbstractNewtonAlgorithm{CJ}) where {CJ} = CJ
+
+# Circumventing https://github.com/SciML/RecursiveArrayTools.jl/issues/277
+_iszero(x) = iszero(x)
+_iszero(x::ArrayPartition) = all(_iszero, x.x)
+
+_mutable_zero(x) = zero(x)
+_mutable_zero(x::SArray) = MArray(x)
+
+_mutable(x) = x
+_mutable(x::SArray) = MArray(x)
+_maybe_mutable(x, ::AbstractFiniteDifferencesMode) = _mutable(x)
+# The shadow allocated for Enzyme needs to be mutable
+_maybe_mutable(x, ::AutoSparseEnzyme) = _mutable(x)
+_maybe_mutable(x, _) = x
diff --git a/test/23_test_cases.jl b/test/23_test_cases.jl
deleted file mode 100644
index 3cb0eb310..000000000
--- a/test/23_test_cases.jl
+++ /dev/null
@@ -1,510 +0,0 @@
-using NonlinearSolve, NLsolve, LinearAlgebra
-
-# Implementation of the 23 test problems in
-# [test_nonlin](https://people.sc.fsu.edu/~jburkardt/m_src/test_nonlin/test_nonlin.html)
-
-# ------------------------------------- Problem 1 ------------------------------------------
-function p1_f!(out, x, p = nothing)
-    n = length(x)
-    out[1] = 1.0 - x[1]
-    out[2:n] .= 10.0 .* (x[2:n] .- x[1:(n - 1)] .* x[1:(n - 1)])
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n)
-x_start[1] = -1.2
-p1_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Generalized Rosenbrock function")
-
-# ------------------------------------- Problem 2 ------------------------------------------
-function p2_f!(out, x, p = nothing)
-    out[1] = x[1] + 10.0 * x[2]
-    out[2] = sqrt(5.0) * (x[3] - x[4])
-    out[3] = (x[2] - 2.0 * x[3])^2
-    out[4] = sqrt(10.0) * (x[1] - x[4]) * (x[1] - x[4])
-    nothing
-end
-
-n = 4
-x_sol = zeros(n)
-x_start = [3.0, -1.0, 0.0, 1.0]
-p2_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell singular function")
-
-# ------------------------------------- Problem 3 ------------------------------------------
-function p3_f!(out, x, p = nothing)
-    out[1] = 10000.0 * x[1] * x[2] - 1.0
-    out[2] = exp(-x[1]) + exp(-x[2]) - 1.0001
-    nothing
-end
-
-n = 2
-x_sol = [1.098159e-05, 9.106146]
-x_start = [0.0, 1.0]
-p3_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Powell badly scaled function")
-
-# ------------------------------------- Problem 4 ------------------------------------------
-function p4_f!(out, x, p = nothing)
-    temp1 = x[2] - x[1] * x[1]
-    temp2 = x[4] - x[3] * x[3]
-
-    out[1] = -200.0 * x[1] * temp1 - (1.0 - x[1])
-    out[2] = 200.0 * temp1 + 20.2 * (x[2] - 1.0) + 19.8 * (x[4] - 1.0)
-    out[3] = -180.0 * x[3] * temp2 - (1.0 - x[3])
-    out[4] = 180.0 * temp2 + 20.2 * (x[4] - 1.0) + 19.8 * (x[2] - 1.0)
-    nothing
-end
-
-n = 4
-x_sol = ones(n)
-x_start = [-3.0, -1.0, -3.0, -1.0]
-p4_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Wood function")
-
-# ------------------------------------- Problem 5 ------------------------------------------
-function p5_f!(out, x, p = nothing)
-    if 0.0 < x[1]
-        temp = atan(x[2] / x[1]) / (2.0 * pi)
-    elseif x[1] < 0.0
-        temp = atan(x[2] / x[1]) / (2.0 * pi) + 0.5
-    else
-        temp = 0.25 * sign(x[2])
-    end
-
-    out[1] = 10.0 * (x[3] - 10.0 * temp)
-    out[2] = 10.0 * (sqrt(x[1] * x[1] + x[2] * x[2]) - 1.0)
-    out[3] = x[3]
-    nothing
-end
-
-n = 3
-x_sol = [1.0, 0.0, 0.0]
-x_start = [-1.0, 0.0, 0.0]
-p5_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Helical valley function")
-
-# ------------------------------------- Problem 6 ------------------------------------------
-function p6_f!(out, x, p = nothing)
-    n = length(x)
-    for i in 1:29
-        ti = i / 29.0
-        sum1 = 0.0
-        temp = 1.0
-        for j in 2:n
-            sum1 = sum1 + j * temp * x[j]
-            temp = ti * temp
-        end
-
-        sum2 = 0.0
-        temp = 1.0
-        for j in 1:n
-            sum2 = sum2 + temp * x[j]
-            temp = ti * temp
-        end
-        temp = 1.0 / ti
-
-        for k in 1:n
-            out[k] = out[k] + temp * (sum1 - sum2 * sum2 - 1.0) * (k - 2.0 * ti * sum2)
-            temp = ti * temp
-        end
-    end
-
-    out[1] = out[1] + 3.0 * x[1] - 2.0 * x[1] * x[1] + 2.0 * x[1]^3
-    out[2] = out[2] + x[2] - x[2]^2 - 1.0
-    nothing
-end
-
-n = 2
-x_sol = []
-x_start = zeros(n)
-p6_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Watson function")
-
-# ------------------------------------- Problem 7 ------------------------------------------
-function p7_f!(out, x, p = nothing)
-    n = length(x)
-    out .= 0.0
-    for j in 1:n
-        t1 = 1.0
-        t2 = x[j]
-        for i in 1:n
-            out[i] += t2
-            t3 = 2.0 * x[j] * t2 - t1
-            t1 = t2
-            t2 = t3
-        end
-    end
-    out ./= n
-
-    for i in 1:n
-        ip1 = i
-        if ip1 % 2 == 0
-            out[i] = out[i] + 1.0 / (ip1 * ip1 - 1)
-        end
-    end
-    nothing
-end
-
-n = 2
-x_sol = [0.2113248654051871, 0.7886751345948129]
-x_sol .= 2.0 .* x_sol .- 1.0
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (2 * i - n) / (n + 1)
-end
-p7_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chebyquad function")
-
-# ------------------------------------- Problem 8 ------------------------------------------
-function p8_f!(out, x, p = nothing)
-    n = length(x)
-    out[1:(n - 1)] .= x[1:(n - 1)] .+ sum(x) .- (n + 1)
-    out[n] = prod(x) - 1.0
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = ones(n) ./ 2
-p8_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Brown almost linear function")
-
-# ------------------------------------- Problem 9 ------------------------------------------
-function p9_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        out[k] = 2.0 * x[k] + 0.5 * h^2 * (x[k] + k * h + 1.0)^3
-        if 1 < k
-            out[k] = out[k] - x[k - 1]
-        end
-        if k < n
-            out[k] = out[k] - x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p9_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete boundary value function")
-
-# ------------------------------------- Problem 10 -----------------------------------------
-function p10_f!(out, x, p = nothing)
-    n = length(x)
-    h = 1.0 / (n + 1)
-    for k in 1:n
-        tk = k / (n + 1)
-        sum1 = 0.0
-        for j in 1:k
-            tj = j * h
-            sum1 = sum1 + tj * (x[j] + tj + 1.0)^3
-        end
-        sum2 = 0.0
-        for j in k:n
-            tj = j * h
-            sum2 = sum2 + (1.0 - tj) * (x[j] + tj + 1.0)^3
-        end
-
-        out[k] = x[k] + h * ((1.0 - tk) * sum1 + tk * sum2) / 2.0
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = (i * (i - n - 1)) / (n + 1)^2
-end
-p10_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Discrete integral equation function")
-
-# ------------------------------------- Problem 11 -----------------------------------------
-function p11_f!(out, x, p = nothing)
-    n = length(x)
-    c_sum = sum(cos.(x))
-    for k in 1:n
-        out[k] = n - c_sum + k * (1.0 - cos(x[k])) - sin(x[k])
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) / n
-p11_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Trigonometric function")
-
-# ------------------------------------- Problem 12 -----------------------------------------
-function p12_f!(out, x, p = nothing)
-    n = length(x)
-    sum1 = 0.0
-    for j in 1:n
-        sum1 += j * (x[j] - 1.0)
-    end
-    for k in 1:n
-        out[k] = x[k] - 1.0 + k * sum1 * (1.0 + 2.0 * sum1 * sum1)
-    end
-    nothing
-end
-
-n = 10
-x_sol = ones(n)
-x_start = zeros(n)
-for i in 1:n
-    x_start[i] = 1.0 - i / n
-end
-p12_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Variably dimensioned function")
-
-# ------------------------------------- Problem 13 -----------------------------------------
-function p13_f!(out, x, p = nothing)
-    n = length(x)
-    for k in 1:n
-        out[k] = (3.0 - 2.0 * x[k]) * x[k] + 1.0
-        if 1 < k
-            out[k] -= x[k - 1]
-        end
-        if k < n
-            out[k] -= 2.0 * x[k + 1]
-        end
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p13_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden tridiagonal function")
-
-# ------------------------------------- Problem 14 -----------------------------------------
-function p14_f!(out, x, p = nothing)
-    n = length(x)
-    ml = 5
-    mu = 1
-    for k in 1:n
-        k1 = max(1, k - ml)
-        k2 = min(n, k + mu)
-
-        temp = 0.0
-        for j in k1:k2
-            if j != k
-                temp += x[j] * (1.0 + x[j])
-            end
-        end
-        out[k] = x[k] * (2.0 + 5.0 * x[k] * x[k]) + 1.0 - temp
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n) .* (-1.0)
-p14_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Broyden banded function")
-
-# ------------------------------------- Problem 15 -----------------------------------------
-function p15_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[3]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[4]) - 1.0
-    out[3] = (x[3] * x[1] + x[4] * x[3]) - 0.0
-    out[4] = (x[3] * x[2] + x[4] * x[4]) - 0.0001
-    nothing
-end
-
-n = 4
-x_sol = [0.01, 50.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 1.0]
-p15_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 2 by 2 matrix square root problem")
-
-# ------------------------------------- Problem 16 -----------------------------------------
-function p16_f!(out, x, p = nothing)
-    out[1] = (x[1] * x[1] + x[2] * x[4] + x[3] * x[7]) - 0.0001
-    out[2] = (x[1] * x[2] + x[2] * x[5] + x[3] * x[8]) - 1.0
-    out[3] = x[1] * x[3] + x[2] * x[6] + x[3] * x[9]
-    out[4] = x[4] * x[1] + x[5] * x[4] + x[6] * x[7]
-    out[5] = (x[4] * x[2] + x[5] * x[5] + x[6] * x[8]) - 0.0001
-    out[6] = x[4] * x[3] + x[5] * x[6] + x[6] * x[9]
-    out[7] = x[7] * x[1] + x[8] * x[4] + x[9] * x[7]
-    out[8] = x[7] * x[2] + x[8] * x[5] + x[9] * x[8]
-    out[9] = (x[7] * x[3] + x[8] * x[6] + x[9] * x[9]) - 0.0001
-    nothing
-end
-
-n = 9
-x_sol = [0.01, 50.0, 0.0, 0.0, 0.01, 0.0, 0.0, 0.0, 0.01]
-x_start = [1.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 1.0]
-p16_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Hammarling 3 by 3 matrix square root problem")
-
-# ------------------------------------- Problem 17 -----------------------------------------
-function p17_f!(out, x, p = nothing)
-    out[1] = x[1] + x[2] - 3.0
-    out[2] = x[1]^2 + x[2]^2 - 9.0
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 3.0]
-x_start = [1.0, 5.0]
-p17_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Dennis and Schnabel 2 by 2 example")
-
-# ------------------------------------- Problem 18 -----------------------------------------
-function p18_f!(out, x, p = nothing)
-    if x[1] != 0.0
-        out[1] = x[2]^2 * (1.0 - exp(-x[1] * x[1])) / x[1]
-    else
-        out[1] = 0.0
-    end
-    if x[2] != 0.0
-        out[2] = x[1] * (1.0 - exp(-x[2] * x[2])) / x[2]
-    else
-        out[2] = 0.0
-    end
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [2.0, 2.0]
-p18_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 18")
-
-# ------------------------------------- Problem 19 -----------------------------------------
-function p19_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1]^2 + x[2]^2)
-    out[2] = x[2] * (x[1]^2 + x[2]^2)
-    nothing
-end
-
-n = 2
-x_sol = zeros(n)
-x_start = [3.0, 3.0]
-p19_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Sample problem 19")
-
-# ------------------------------------- Problem 20 -----------------------------------------
-function p20_f!(out, x, p = nothing)
-    out[1] = x[1] * (x[1] - 5.0)^2
-    nothing
-end
-
-n = 1
-x_sol = [5.0] # OR [0.0]...
-x_start = [1.0]
-p20_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Scalar problem f(x) = x(x - 5)^2")
-
-# ------------------------------------- Problem 21 -----------------------------------------
-function p21_f!(out, x, p = nothing)
-    out[1] = x[1] - x[2]^3 + 5.0 * x[2]^2 - 2.0 * x[2] - 13.0
-    out[2] = x[1] + x[2]^3 + x[2]^2 - 14.0 * x[2] - 29.0
-    nothing
-end
-
-n = 2
-x_sol = [5.0, 4.0]
-x_start = [0.5, -2.0]
-p21_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Freudenstein-Roth function")
-
-# ------------------------------------- Problem 22 -----------------------------------------
-function p22_f!(out, x, p = nothing)
-    out[1] = x[1] * x[1] - x[2] + 1.0
-    out[2] = x[1] - cos(0.5 * pi * x[2])
-    nothing
-end
-
-n = 2
-x_sol = [0.0, 1.0]
-x_start = [1.0, 0.0]
-p22_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Boggs function")
-
-# ------------------------------------- Problem 23 -----------------------------------------
-function p23_f!(out, x, p = nothing)
-    c = 0.9
-    out[1:n] = x[1:n]
-    μ = zeros(n)
-    for i in 1:n
-        μ[i] = (2 * i) / (2 * n)
-    end
-    for i in 1:n
-        s = 0.0
-        for j in 1:n
-            s = s + (μ[i] * x[j]) / (μ[i] + μ[j])
-        end
-        term = 1.0 - c * s / (2 * n)
-        out[i] -= 1.0 / term
-    end
-    nothing
-end
-
-n = 10
-x_sol = []
-x_start = ones(n)
-p23_dict = Dict("n" => n, "start" => x_start, "sol" => x_sol,
-    "title" => "Chandrasekhar function")
-
-# ----------------------------------- Solve problems ---------------------------------------
-problems = (p1_f!, p2_f!, p3_f!, p4_f!, p5_f!, p6_f!, p7_f!, p8_f!, p9_f!, p10_f!, p11_f!,
-    p12_f!, p13_f!, p14_f!, p15_f!, p16_f!, p17_f!, p18_f!, p19_f!, p20_f!, p21_f!,
-    p22_f!, p23_f!)
-dicts = (p1_dict, p2_dict, p3_dict, p4_dict, p5_dict, p6_dict, p7_dict, p8_dict, p9_dict,
-    p10_dict, p11_dict, p12_dict, p13_dict, p14_dict, p15_dict, p16_dict, p17_dict,
-    p18_dict, p19_dict, p20_dict, p21_dict, p22_dict, p23_dict)
-algs = (NewtonRaphson(), TrustRegion(), LevenbergMarquardt())
-names = ("NewtonRaphson", "TrustRegion", "LevenbergMarquardt")
-
-for (problem, dict) in zip(problems, dicts)
-    for (alg, name) in zip(algs, names)
-        local x = dict["start"]
-        local nlprob = NonlinearProblem(problem, x)
-        local out = similar(x)
-        try
-            problem(out,
-                solve(nlprob, alg, abstol = 1e-15, reltol = 1e-15).u, nothing)
-            dict["error_" * name] = ""
-        catch
-            # println("Error in $name")
-            dict["error_" * name] = "(Singular error)"
-        end
-        dict["out_" * name] = out
-    end
-    local x = dict["start"]
-    local nlprob = NonlinearProblem(problem, x)
-    sol = nlsolve(problem, x, xtol = 1e-15, ftol = 1e-15)
-    dict["norm_nlsolve"] = sol.residual_norm
-end
-
-# ----------------------------------- Print results ----------------------------------------
-i_str = i_str = rpad("nr", 3, " ")
-title_str = rpad("Problem", 50, " ")
-n_str = rpad("n", 5, " ")
-norm_str = rpad(names[1], 20, " ") * rpad(names[2], 20, " ") * rpad(names[3], 20, " ") *
-           rpad("nlsolve", 20, " ")
-println("$i_str $title_str $n_str $norm_str")
-
-for (i, dict) in enumerate(dicts)
-    local i_str = rpad(string(i), 3, " ")
-    local title_str = rpad(dict["title"], 50, " ")
-    local n_str = rpad(string(dict["n"]), 5, " ")
-    local norm_str = ""
-    for (alg, name) in zip(algs, names)
-        norm_str *= rpad(string(trunc(norm(dict["out_" * name]); sigdigits = 5)), 20, " ")
-    end
-    norm_str *= rpad(string(round(dict["norm_nlsolve"]; sigdigits = 5)), 20, " ")
-    println("$i_str $title_str $n_str $norm_str")
-end
diff --git a/test/basictests.jl b/test/basictests.jl
index 05a0152fa..ee42db9f3 100644
--- a/test/basictests.jl
+++ b/test/basictests.jl
@@ -1,774 +1,672 @@
-using NonlinearSolve
-using StaticArrays
-using BenchmarkTools
-using LinearSolve
-using Random
-using LinearAlgebra
-using Test
+using BenchmarkTools, LinearSolve, NonlinearSolve, StaticArrays, Random, LinearAlgebra,
+    Test, ForwardDiff, Zygote, Enzyme, SparseDiffTools
 
-# --- NewtonRaphson tests ---
-
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, NewtonRaphson(), abstol = 1e-9))
-end
-
-function ff(u, p)
-    u .* u .- 2
-end
-const cu0 = @SVector[1.0, 1.0]
-function sf(u, p)
-    u * u - 2
-end
-const csu0 = 1.0
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-# @test (@ballocated benchmark_immutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_mutable(ff, cu0)) < 200
-# @test (@ballocated benchmark_scalar(sf, csu0)) < 400
-
-function benchmark_inplace(f, u0, linsolve, precs)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
-    sol = solve!(solver)
-end
+_nameof(x) = applicable(nameof, x) ? nameof(x) : _nameof(typeof(x))
 
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-precs = [
-    NonlinearSolve.DEFAULT_PRECS,
-    (args...) -> (Diagonal(rand!(similar(u0))), nothing)
-]
+# --- NewtonRaphson tests ---
 
-for prec in precs, linsolve in (nothing, KrylovJL_GMRES())
-    sol = benchmark_inplace(ffiip, u0, linsolve, prec)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
+@testset "NewtonRaphson" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0)
+        prob = NonlinearProblem{false}(f, u0, p)
+        cache = init(prob, NewtonRaphson(), abstol = 1e-9)
+        return solve!(cache)
+    end
 
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, NewtonRaphson(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) <= 64
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; linsolve, precs)
+        prob = NonlinearProblem{true}(f, u0, p)
+        cache = init(prob, NewtonRaphson(; linsolve, precs), abstol = 1e-9)
+        return solve!(cache)
+    end
 
-# AD Tests
-using ForwardDiff
+    quadratic_f(u, p) = u .* u .- p
+    quadratic_f!(du, u, p) = (du .= u .* u .- p)
 
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+    @testset "[OOP] u0: $(typeof(u0))" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0)
+        sol = benchmark_nlsolve_oop(quadratic_f, u0)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-9)
-    return sol.u[end]
-end
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0), NewtonRaphson(),
+            abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-for p in 1.0:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    precs = [NonlinearSolve.DEFAULT_PRECS, :Random]
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    @testset "[IIP] u0: $(typeof(u0)) precs: $(_nameof(prec)) linsolve: $(_nameof(linsolve))" for u0 in ([
+            1.0, 1.0],), prec in precs, linsolve in (nothing, KrylovJL_GMRES())
+        if prec === :Random
+            prec = (args...) -> (Diagonal(randn!(similar(u0))), nothing)
+        end
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0; linsolve, precs = prec)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-# NewtonRaphson
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, NewtonRaphson(), abstol = 1e-10)
-    return sol.u
-end
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0),
+            NewtonRaphson(; linsolve, precs = prec), abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
+    end
 
-@test ForwardDiff.derivative(g, 1.0) ≈ 0.5
+    # Immutable
+    @testset "[OOP] [Immutable AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, @SVector[1.0, 1.0], p)
+            res_true = sqrt(p)
+            all(res.u .≈ res_true)
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f,
+            @SVector[1.0, 1.0], p).u[end], p) ≈ 1 / (2 * sqrt(p))
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    @testset "[OOP] [Scalar AD] p: $(p)" for p in 1.0:0.1:100.0
+        @test begin
+            res = benchmark_nlsolve_oop(quadratic_f, 1.0, p)
+            res_true = sqrt(p)
+            res.u ≈ res_true
+        end
+        @test ForwardDiff.derivative(p -> benchmark_nlsolve_oop(quadratic_f, 1.0, p).u, p) ≈
+              1 / (2 * sqrt(p))
+    end
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, NewtonRaphson())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
+    quadratic_f2(u, p) = @. p[1] * u * u - p[2]
+    t = (p) -> [sqrt(p[2] / p[1])]
+    p = [0.9, 50.0]
+    @test benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u ≈ sqrt(p[2] / p[1])
+    @test ForwardDiff.jacobian(p -> [benchmark_nlsolve_oop(quadratic_f2, 0.5, p).u], p) ≈
+          ForwardDiff.jacobian(t, p)
+
+    # Iterator interface
+    function nlprob_iterator_interface(f, p_range, ::Val{iip}) where {iip}
+        probN = NonlinearProblem{iip}(f, iip ? [0.5] : 0.5, p_range[begin])
+        cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
+        sols = zeros(length(p_range))
+        for (i, p) in enumerate(p_range)
+            reinit!(cache, iip ? [cache.u[1]] : cache.u; p = p)
+            sol = solve!(cache)
+            sols[i] = iip ? sol.u[1] : sol.u
+        end
+        return sols
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, NewtonRaphson(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+    p = range(0.01, 2, length = 200)
+    @test nlprob_iterator_interface(quadratic_f, p, Val(false)) ≈ sqrt.(p)
+    @test nlprob_iterator_interface(quadratic_f!, p, Val(true)) ≈ sqrt.(p)
+
+    probN = NonlinearProblem(quadratic_f, @SVector[1.0, 1.0], 2.0)
+    @testset "ADType: $(autodiff) u0: $(u0)" for autodiff in (false, true,
+        AutoSparseForwardDiff(), AutoSparseFiniteDiff(), AutoZygote(), AutoSparseZygote(),
+        AutoSparseEnzyme()), u0 in (1.0, [1.0, 1.0], @SVector[1.0, 1.0])
+        probN = NonlinearProblem(quadratic_f, u0, 2.0)
+        @test all(solve(probN, NewtonRaphson(; autodiff)).u .≈ sqrt(2.0))
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, NewtonRaphson()).u[end] ≈ sqrt(2.0)
-@test solve(probN, NewtonRaphson(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson()).u ≈ sol
-    @test solve(probN, NewtonRaphson(; autodiff = false)).u ≈ sol
 end
 
 # --- TrustRegion tests ---
-
-function benchmark_immutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9))
-end
-
-function ff(u, p = nothing)
-    u .* u .- 2
-end
-
-function sf(u, p = nothing)
-    u * u - 2
-end
-
-u0 = [1.0, 1.0]
-radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
-    RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_immutable(ff, cu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_mutable(ff, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-    sol = benchmark_scalar(sf, csu0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test abs(sol.u * sol.u - 2) < 1e-9
-end
-
-function benchmark_inplace(f, u0, radius_update_scheme)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, TrustRegion(; radius_update_scheme), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p = nothing)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-for radius_update_scheme in radius_update_schemes
-    sol = benchmark_inplace(ffiip, u0, radius_update_scheme)
-    @test sol.retcode === ReturnCode.Success
-    @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-end
-
-for radius_update_scheme in radius_update_schemes
-    probN = NonlinearProblem{true}(ffiip, u0)
-    solver = init(probN, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9)
-    @test (@ballocated solve!(solver)) < 200
-end
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-9)
-    return sol.u[end]
-end
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
-
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
-
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
-
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion())
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
-    return [sol.u]
-end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Iterator interface
-f = (u, p) -> u * u - p
-g = function (p_range)
-    probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, cache.u; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u
-    end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
-g = function (p_range)
-    probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
-    cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
-    sols = zeros(length(p_range))
-    for (i, p) in enumerate(p_range)
-        reinit!(cache, [cache.u[1]]; p = p)
-        sol = solve!(cache)
-        sols[i] = sol.u[1]
+@testset "TrustRegion" begin
+    function benchmark_nlsolve_oop(f, u0, p = 2.0; radius_update_scheme)
+        prob = NonlinearProblem{false}(f, u0, p)
+        cache = init(prob, TrustRegion(; radius_update_scheme), abstol = 1e-9)
+        return solve!(cache)
     end
-    return sols
-end
-p = range(0.01, 2, length = 200)
-@test g(p) ≈ sqrt.(p)
-
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
-
-@test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
-@test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-@test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
-      sqrt(2.0)
-@test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
-      sqrt(2.0)
-
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion()).u ≈ sol
-    @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
-end
-
-# Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
-        abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# Test kwars in `TrustRegion`
-max_trust_radius = [10.0, 100.0, 1000.0]
-initial_trust_radius = [10.0, 1.0, 0.1]
-step_threshold = [0.0, 0.01, 0.25]
-shrink_threshold = [0.25, 0.3, 0.5]
-expand_threshold = [0.5, 0.8, 0.9]
-shrink_factor = [0.1, 0.3, 0.5]
-expand_factor = [1.5, 2.0, 3.0]
-max_shrink_times = [10, 20, 30]
-
-list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
-    shrink_threshold, expand_threshold, shrink_factor,
-    expand_factor, max_shrink_times)
-for options in list_of_options
-    local probN, sol, alg
-    alg = TrustRegion(max_trust_radius = options[1],
-        initial_trust_radius = options[2],
-        step_threshold = options[3],
-        shrink_threshold = options[4],
-        expand_threshold = options[5],
-        shrink_factor = options[6],
-        expand_factor = options[7],
-        max_shrink_times = options[8])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
-end
-
-# Testing consistency of iip vs oop iterations
-
-maxiterations = [2, 3, 4, 5]
-u0 = [1.0, 1.0]
-function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
-    prob_iip = NonlinearProblem{true}(fip, u0)
-    solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_iip = solve!(solver)
-
-    prob_oop = NonlinearProblem{false}(f, u0)
-    solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
-        abstol = 1e-9, maxiters = maxiters)
-    sol_oop = solve!(solver)
-
-    return sol_iip.u[end], sol_oop.u[end]
-end
 
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
-    @test iip == oop
-end
-
-for maxiters in maxiterations
-    iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
-    @test iip == oop
-end
-
-# --- LevenbergMarquardt tests ---
-
-function benchmark_immutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_mutable(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function benchmark_scalar(f, u0)
-    probN = NonlinearProblem{false}(f, u0)
-    sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
-end
-
-function ff(u, p)
-    u .* u .- 2
-end
-
-function sf(u, p)
-    u * u - 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_immutable(ff, cu0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_mutable(ff, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-sol = benchmark_scalar(sf, csu0)
-@test sol.retcode === ReturnCode.Success
-@test abs(sol.u * sol.u - 2) < 1e-9
-
-function benchmark_inplace(f, u0)
-    probN = NonlinearProblem{true}(f, u0)
-    solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-    sol = solve!(solver)
-end
-
-function ffiip(du, u, p)
-    du .= u .* u .- 2
-end
-u0 = [1.0, 1.0]
-
-sol = benchmark_inplace(ffiip, u0)
-@test sol.retcode === ReturnCode.Success
-@test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
-
-u0 = [1.0, 1.0]
-probN = NonlinearProblem{true}(ffiip, u0)
-solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
-@test (@ballocated solve!(solver)) < 120
-
-# AD Tests
-using ForwardDiff
-
-# Immutable
-f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
-
-g = function (p)
-    probN = NonlinearProblem{false}(f, csu0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
-    return sol.u[end]
-end
+    function benchmark_nlsolve_iip(f, u0, p = 2.0; radius_update_scheme)
+        prob = NonlinearProblem{true}(f, u0, p)
+        cache = init(prob, TrustRegion(; radius_update_scheme), abstol = 1e-9)
+        return solve!(cache)
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    quadratic_f(u, p) = u .* u .- p
+    quadratic_f!(du, u, p) = (du .= u .* u .- p)
 
-# Scalar
-f, u0 = (u, p) -> u * u - p, 1.0
+    radius_update_schemes = [RadiusUpdateSchemes.Simple, RadiusUpdateSchemes.Hei,
+        RadiusUpdateSchemes.Yuan, RadiusUpdateSchemes.Fan, RadiusUpdateSchemes.Bastin]
 
-g = function (p)
-    probN = NonlinearProblem{false}(f, oftype(p, u0), p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
+    @testset "[OOP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0], @SVector[1.0, 1.0], 1.0), radius_update_scheme in radius_update_schemes
+        sol = benchmark_nlsolve_oop(quadratic_f, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-@test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+        cache = init(NonlinearProblem{false}(quadratic_f, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) < 200
+    end
 
-for p in 1.1:0.1:100.0
-    @test g(p) ≈ sqrt(p)
-    @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
-end
+    @testset "[IIP] u0: $(typeof(u0)) radius_update_scheme: $(radius_update_scheme)" for u0 in ([1.0, 1.0],), radius_update_scheme in radius_update_schemes
+        sol = benchmark_nlsolve_iip(quadratic_f!, u0; radius_update_scheme)
+        @test SciMLBase.successful_retcode(sol)
+        @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
 
-f = (u, p) -> p[1] * u * u - p[2]
-t = (p) -> [sqrt(p[2] / p[1])]
-p = [0.9, 50.0]
-gnewton = function (p)
-    probN = NonlinearProblem{false}(f, 0.5, p)
-    sol = solve(probN, LevenbergMarquardt())
-    return [sol.u]
+        cache = init(NonlinearProblem{true}(quadratic_f!, u0, 2.0),
+            TrustRegion(; radius_update_scheme); abstol = 1e-9)
+        @test (@ballocated solve!($cache)) ≤ 64
+    end
 end
-@test gnewton(p) ≈ [sqrt(p[2] / p[1])]
-@test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
-
-# Error Checks
-f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
-probN = NonlinearProblem(f, u0)
 
-@test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
-@test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
 
-for u0 in [1.0, [1, 1.0]]
-    local f, probN, sol
-    f = (u, p) -> u .* u .- 2.0
-    probN = NonlinearProblem(f, u0)
-    sol = sqrt(2) * u0
-
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt()).u ≈ sol
-    @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
-end
-
-# Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
-u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
-global g, f
-f = (u, p) -> 0.010000000000000002 .+
-              10.000000000000002 ./ (1 .+
-               (0.21640425613334457 .+
-                216.40425613334457 ./ (1 .+
-                 (0.21640425613334457 .+
-                  216.40425613334457 ./
-                  (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
-              0.0011552453009332421u .- p
-g = function (p)
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
-    return sol.u
-end
-p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
-u = g(p)
-f(u, p)
-@test all(abs.(f(u, p)) .< 1e-10)
-
-# # Test kwars in `LevenbergMarquardt`
-damping_initial = [0.5, 2.0, 5.0]
-damping_increase_factor = [1.5, 3.0, 10.0]
-damping_decrease_factor = [2, 5, 10]
-finite_diff_step_geodesic = [0.02, 0.2, 0.3]
-α_geodesic = [0.6, 0.8, 0.9]
-b_uphill = [0, 1, 2]
-min_damping_D = [1e-12, 1e-9, 1e-4]
-
-list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
-    finite_diff_step_geodesic, α_geodesic, b_uphill,
-    min_damping_D)
-for options in list_of_options
-    local probN, sol, alg
-    alg = LevenbergMarquardt(damping_initial = options[1],
-        damping_increase_factor = options[2],
-        damping_decrease_factor = options[3],
-        finite_diff_step_geodesic = options[4],
-        α_geodesic = options[5],
-        b_uphill = options[6],
-        min_damping_D = options[7])
-
-    probN = NonlinearProblem{false}(f, u0, p)
-    sol = solve(probN, alg, abstol = 1e-10)
-    @test all(abs.(f(u, p)) .< 1e-10)
-end
+# # Immutable
+# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# # Scalar
+# f, u0 = (u, p) -> u * u - p, 1.0
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# f = (u, p) -> p[1] * u * u - p[2]
+# t = (p) -> [sqrt(p[2] / p[1])]
+# p = [0.9, 50.0]
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion())
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin))
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# # Iterator interface
+# f = (u, p) -> u * u - p
+# g = function (p_range)
+#     probN = NonlinearProblem{false}(f, 0.5, p_range[begin])
+#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+#     sols = zeros(length(p_range))
+#     for (i, p) in enumerate(p_range)
+#         reinit!(cache, cache.u; p = p)
+#         sol = solve!(cache)
+#         sols[i] = sol.u
+#     end
+#     return sols
+# end
+# p = range(0.01, 2, length = 200)
+# @test g(p) ≈ sqrt.(p)
+
+# f = (res, u, p) -> (res[begin] = u[1] * u[1] - p)
+# g = function (p_range)
+#     probN = NonlinearProblem{true}(f, [0.5], p_range[begin])
+#     cache = init(probN, TrustRegion(); maxiters = 100, abstol = 1e-10)
+#     sols = zeros(length(p_range))
+#     for (i, p) in enumerate(p_range)
+#         reinit!(cache, [cache.u[1]]; p = p)
+#         sol = solve!(cache)
+#         sols[i] = sol.u[1]
+#     end
+#     return sols
+# end
+# p = range(0.01, 2, length = 200)
+# @test g(p) ≈ sqrt.(p)
+
+# # Error Checks
+# f, u0 = (u, p) -> u .* u .- 2, @SVector[1.0, 1.0]
+# probN = NonlinearProblem(f, u0)
+
+# @test solve(probN, TrustRegion()).u[end] ≈ sqrt(2.0)
+# @test solve(probN, TrustRegion(; autodiff = false)).u[end] ≈ sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Hei)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Hei, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Yuan)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Yuan, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Fan, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# @test solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin)).u[end] ≈
+#       sqrt(2.0)
+# @test solve(probN, TrustRegion(; radius_update_scheme = RadiusUpdateSchemes.Bastin, autodiff = false)).u[end] ≈
+#       sqrt(2.0)
+
+# for u0 in [1.0, [1, 1.0]]
+#     local f, probN, sol
+#     f = (u, p) -> u .* u .- 2.0
+#     probN = NonlinearProblem(f, u0)
+#     sol = sqrt(2) * u0
+
+#     @test solve(probN, TrustRegion()).u ≈ sol
+#     @test solve(probN, TrustRegion()).u ≈ sol
+#     @test solve(probN, TrustRegion(; autodiff = false)).u ≈ sol
+# end
+
+# # Test that `TrustRegion` passes a test that `NewtonRaphson` fails on.
+# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+# global g, f
+# f = (u, p) -> 0.010000000000000002 .+
+#               10.000000000000002 ./ (1 .+
+#                (0.21640425613334457 .+
+#                 216.40425613334457 ./ (1 .+
+#                  (0.21640425613334457 .+
+#                   216.40425613334457 ./
+#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+#               0.0011552453009332421u .- p
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(), abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Fan),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, TrustRegion(radius_update_scheme = RadiusUpdateSchemes.Bastin),
+#         abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# # Test kwars in `TrustRegion`
+# max_trust_radius = [10.0, 100.0, 1000.0]
+# initial_trust_radius = [10.0, 1.0, 0.1]
+# step_threshold = [0.0, 0.01, 0.25]
+# shrink_threshold = [0.25, 0.3, 0.5]
+# expand_threshold = [0.5, 0.8, 0.9]
+# shrink_factor = [0.1, 0.3, 0.5]
+# expand_factor = [1.5, 2.0, 3.0]
+# max_shrink_times = [10, 20, 30]
+
+# list_of_options = zip(max_trust_radius, initial_trust_radius, step_threshold,
+#     shrink_threshold, expand_threshold, shrink_factor,
+#     expand_factor, max_shrink_times)
+# for options in list_of_options
+#     local probN, sol, alg
+#     alg = TrustRegion(max_trust_radius = options[1],
+#         initial_trust_radius = options[2],
+#         step_threshold = options[3],
+#         shrink_threshold = options[4],
+#         expand_threshold = options[5],
+#         shrink_factor = options[6],
+#         expand_factor = options[7],
+#         max_shrink_times = options[8])
+
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, alg, abstol = 1e-10)
+#     @test all(abs.(f(u, p)) .< 1e-10)
+# end
+
+# # Testing consistency of iip vs oop iterations
+
+# maxiterations = [2, 3, 4, 5]
+# u0 = [1.0, 1.0]
+# function iip_oop(f, fip, u0, radius_update_scheme, maxiters)
+#     prob_iip = NonlinearProblem{true}(fip, u0)
+#     solver = init(prob_iip, TrustRegion(radius_update_scheme = radius_update_scheme),
+#         abstol = 1e-9, maxiters = maxiters)
+#     sol_iip = solve!(solver)
+
+#     prob_oop = NonlinearProblem{false}(f, u0)
+#     solver = init(prob_oop, TrustRegion(radius_update_scheme = radius_update_scheme),
+#         abstol = 1e-9, maxiters = maxiters)
+#     sol_oop = solve!(solver)
+
+#     return sol_iip.u[end], sol_oop.u[end]
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Simple, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Hei, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Yuan, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Fan, maxiters)
+#     @test iip == oop
+# end
+
+# for maxiters in maxiterations
+#     iip, oop = iip_oop(ff, ffiip, u0, RadiusUpdateSchemes.Bastin, maxiters)
+#     @test iip == oop
+# end
+
+# # --- LevenbergMarquardt tests ---
+
+# function benchmark_immutable(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function benchmark_mutable(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function benchmark_scalar(f, u0)
+#     probN = NonlinearProblem{false}(f, u0)
+#     sol = (solve(probN, LevenbergMarquardt(), abstol = 1e-9))
+# end
+
+# function ff(u, p)
+#     u .* u .- 2
+# end
+
+# function sf(u, p)
+#     u * u - 2
+# end
+# u0 = [1.0, 1.0]
+
+# sol = benchmark_immutable(ff, cu0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+# sol = benchmark_mutable(ff, u0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+# sol = benchmark_scalar(sf, csu0)
+# @test SciMLBase.successful_retcode(sol)
+# @test abs(sol.u * sol.u - 2) < 1e-9
+
+# function benchmark_inplace(f, u0)
+#     probN = NonlinearProblem{true}(f, u0)
+#     solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     sol = solve!(solver)
+# end
+
+# function ffiip(du, u, p)
+#     du .= u .* u .- 2
+# end
+# u0 = [1.0, 1.0]
+
+# sol = benchmark_inplace(ffiip, u0)
+# @test SciMLBase.successful_retcode(sol)
+# @test all(abs.(sol.u .* sol.u .- 2) .< 1e-9)
+
+# u0 = [1.0, 1.0]
+# probN = NonlinearProblem{true}(ffiip, u0)
+# solver = init(probN, LevenbergMarquardt(), abstol = 1e-9)
+# @test (@ballocated solve!(solver)) < 120
+
+# # AD Tests
+# using ForwardDiff
+
+# # Immutable
+# f, u0 = (u, p) -> u .* u .- p, @SVector[1.0, 1.0]
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, csu0, p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-9)
+#     return sol.u[end]
+# end
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# # Scalar
+# f, u0 = (u, p) -> u * u - p, 1.0
+
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, oftype(p, u0), p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
+#     return sol.u
+# end
+
+# @test ForwardDiff.derivative(g, 3.0) ≈ 1 / (2 * sqrt(3.0))
+
+# for p in 1.1:0.1:100.0
+#     @test g(p) ≈ sqrt(p)
+#     @test ForwardDiff.derivative(g, p) ≈ 1 / (2 * sqrt(p))
+# end
+
+# f = (u, p) -> p[1] * u * u - p[2]
+# t = (p) -> [sqrt(p[2] / p[1])]
+# p = [0.9, 50.0]
+# gnewton = function (p)
+#     probN = NonlinearProblem{false}(f, 0.5, p)
+#     sol = solve(probN, LevenbergMarquardt())
+#     return [sol.u]
+# end
+# @test gnewton(p) ≈ [sqrt(p[2] / p[1])]
+# @test ForwardDiff.jacobian(gnewton, p) ≈ ForwardDiff.jacobian(t, p)
+
+# # Error Checks
+# f, u0 = (u, p) -> u .* u .- 2.0, @SVector[1.0, 1.0]
+# probN = NonlinearProblem(f, u0)
+
+# @test solve(probN, LevenbergMarquardt()).u[end] ≈ sqrt(2.0)
+# @test solve(probN, LevenbergMarquardt(; autodiff = false)).u[end] ≈ sqrt(2.0)
+
+# for u0 in [1.0, [1, 1.0]]
+#     local f, probN, sol
+#     f = (u, p) -> u .* u .- 2.0
+#     probN = NonlinearProblem(f, u0)
+#     sol = sqrt(2) * u0
+
+#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
+#     @test solve(probN, LevenbergMarquardt()).u ≈ sol
+#     @test solve(probN, LevenbergMarquardt(; autodiff = false)).u ≈ sol
+# end
+
+# # Test that `LevenbergMarquardt` passes a test that `NewtonRaphson` fails on.
+# u0 = [-10.0, -1.0, 1.0, 2.0, 3.0, 4.0, 10.0]
+# global g, f
+# f = (u, p) -> 0.010000000000000002 .+
+#               10.000000000000002 ./ (1 .+
+#                (0.21640425613334457 .+
+#                 216.40425613334457 ./ (1 .+
+#                  (0.21640425613334457 .+
+#                   216.40425613334457 ./
+#                   (1 .+ 0.0006250000000000001(u .^ 2.0))) .^ 2.0)) .^ 2.0) .-
+#               0.0011552453009332421u .- p
+# g = function (p)
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, LevenbergMarquardt(), abstol = 1e-10)
+#     return sol.u
+# end
+# p = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
+# u = g(p)
+# f(u, p)
+# @test all(abs.(f(u, p)) .< 1e-10)
+
+# # # Test kwars in `LevenbergMarquardt`
+# damping_initial = [0.5, 2.0, 5.0]
+# damping_increase_factor = [1.5, 3.0, 10.0]
+# damping_decrease_factor = [2, 5, 10]
+# finite_diff_step_geodesic = [0.02, 0.2, 0.3]
+# α_geodesic = [0.6, 0.8, 0.9]
+# b_uphill = [0, 1, 2]
+# min_damping_D = [1e-12, 1e-9, 1e-4]
+
+# list_of_options = zip(damping_initial, damping_increase_factor, damping_decrease_factor,
+#     finite_diff_step_geodesic, α_geodesic, b_uphill,
+#     min_damping_D)
+# for options in list_of_options
+#     local probN, sol, alg
+#     alg = LevenbergMarquardt(damping_initial = options[1],
+#         damping_increase_factor = options[2],
+#         damping_decrease_factor = options[3],
+#         finite_diff_step_geodesic = options[4],
+#         α_geodesic = options[5],
+#         b_uphill = options[6],
+#         min_damping_D = options[7])
+
+#     probN = NonlinearProblem{false}(f, u0, p)
+#     sol = solve(probN, alg, abstol = 1e-10)
+#     @test all(abs.(f(u, p)) .< 1e-10)
+# end
diff --git a/test/runtests.jl b/test/runtests.jl
index a84fc3cb1..f8cf35db3 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -14,11 +14,11 @@ end
 @time begin
     if GROUP == "All" || GROUP == "Core"
         @time @safetestset "Basic Tests + Some AD" include("basictests.jl")
-        @time @safetestset "Sparsity Tests" include("sparse.jl")
+        # @time @safetestset "Sparsity Tests" include("sparse.jl")
     end
 
-    if GROUP == "GPU"
-        activate_downstream_env()
-        @time @safetestset "GPU Tests" include("gpu.jl")
-    end
+    # if GROUP == "GPU"
+    #     activate_downstream_env()
+    #     @time @safetestset "GPU Tests" include("gpu.jl")
+    # end
 end