diff --git a/src/first_order.rs b/src/first_order.rs index 7b7a430..65117d3 100644 --- a/src/first_order.rs +++ b/src/first_order.rs @@ -2,4 +2,4 @@ mod adaptive_descent; mod macros; mod steepest_descent; -pub use steepest_descent::{descent, Descent, DescentParameter}; +pub use steepest_descent::{descent, Armijo, DescentParameter, PowellWolfe}; diff --git a/src/first_order/adaptive_descent.rs b/src/first_order/adaptive_descent.rs index 3058017..613f770 100644 --- a/src/first_order/adaptive_descent.rs +++ b/src/first_order/adaptive_descent.rs @@ -1,8 +1,8 @@ mod adadelta; mod adagrad; -pub(crate) use adadelta::adadelta; -pub(crate) use adagrad::adagrad; +pub(crate) use adadelta::AdaDelta; +pub(crate) use adagrad::AdaGrad; pub(crate) const ACCUM_GRAD: &str = "accum_grad"; pub(crate) const ACCUM_UPDATE: &str = "accum_update"; diff --git a/src/first_order/adaptive_descent/adadelta.rs b/src/first_order/adaptive_descent/adadelta.rs index 936daf3..b2096fd 100644 --- a/src/first_order/adaptive_descent/adadelta.rs +++ b/src/first_order/adaptive_descent/adadelta.rs @@ -1,45 +1,23 @@ -use core::ops::{Add, Div, Mul}; +use core::ops::{Add, Mul}; use crate::{ first_order::{ adaptive_descent::{ACCUM_GRAD, ACCUM_UPDATE}, - macros::{descent_rule, impl_iterator_descent}, + macros::{descent_rule, impl_optimizer_descent}, }, traits::{VecDot, Vector}, - Counter, + Counter, Optimizer, }; use hashbrown::HashMap; use num_traits::{Float, One}; -pub(crate) fn adadelta( - accum_grad: &mut X, - accum_update: &X, - squared_grad: &X, - gamma: A, - epsilon: A, -) -> X -where - for<'a> A: Float + Add<&'a X, Output = X> + Mul<&'a X, Output = X> + Mul, - X: Add + Div + FromIterator + IntoIterator, -{ - *accum_grad = gamma * &*accum_grad + (A::one() - gamma) * squared_grad; - (epsilon + accum_update) - .into_iter() - .map(|g| g.sqrt()) - .collect::() - / (epsilon + &*accum_grad) - .into_iter() - .map(|g| g.sqrt()) - .collect::() -} - descent_rule!( AdaDelta, X, [].into_iter().collect::(), [(ACCUM_GRAD, X::zero(1)), (ACCUM_UPDATE, X::zero(1))].into() ); -impl_iterator_descent!(AdaDelta, X); +impl_optimizer_descent!(AdaDelta, X); impl<'a, X, F, G> AdaDelta<'a, X, F, G, X> where diff --git a/src/first_order/adaptive_descent/adagrad.rs b/src/first_order/adaptive_descent/adagrad.rs index f093e5b..77e99e8 100644 --- a/src/first_order/adaptive_descent/adagrad.rs +++ b/src/first_order/adaptive_descent/adagrad.rs @@ -1,38 +1,24 @@ -use core::ops::{Add, Div, Mul}; +use core::ops::{Add, Mul}; use num_traits::Float; use crate::{ first_order::{ adaptive_descent::ACCUM_GRAD, - macros::{descent_rule, impl_iterator_descent}, + macros::{descent_rule, impl_optimizer_descent}, }, traits::{VecDot, Vector}, - Counter, + Counter, Optimizer, }; use hashbrown::HashMap; -pub(crate) fn adagrad(accum_grad: &mut X, squared_grad: X, gamma: A, epsilon: A) -> X -where - for<'a> A: Float + Add<&'a X, Output = X> + Div, - for<'b> &'b X: Add, - X: FromIterator + IntoIterator + Clone, -{ - *accum_grad = &*accum_grad + squared_grad; - gamma - / (epsilon + &*accum_grad) - .into_iter() - .map(|g| g.sqrt()) - .collect::() -} - descent_rule!( AdaGrad, X, [].into_iter().collect::(), [(ACCUM_GRAD, X::zero(1))].into() ); -impl_iterator_descent!(AdaGrad, X); +impl_optimizer_descent!(AdaGrad, X); impl<'a, X, F, G> AdaGrad<'a, X, F, G, X> where diff --git a/src/first_order/macros.rs b/src/first_order/macros.rs index 03b4b5c..46891a4 100644 --- a/src/first_order/macros.rs +++ b/src/first_order/macros.rs @@ -72,16 +72,16 @@ macro_rules! descent_rule { }; } -macro_rules! 
impl_iterator_descent { +macro_rules! impl_optimizer_descent { ($rule:ident, $step:ty) => { impl<'a, X, F, G> core::iter::Iterator for $rule<'a, X, F, G, $step> where - X: Vector + VecDot, + X: Vector + VecDot + Clone, for<'b> &'b X: Add + Mul<&'b X, Output = X>, F: Fn(&X) -> X::Elem, G: Fn(&X) -> X, { - type Item = X::Elem; + type Item = X; fn next(&mut self) -> Option { if self.stop() { None @@ -91,12 +91,31 @@ macro_rules! impl_iterator_descent { self.counter.iter += 1; self.neg_gradfx = -self.grad(&self.x); self.counter.gcalls += 1; - Some(self.stop_metrics) + Some(self.x.clone()) } } } + impl<'a, X, F, G> Optimizer for $rule<'a, X, F, G, $step> + where + X: Vector + VecDot + Clone, + for<'b> &'b X: Add + Mul<&'b X, Output = X>, + F: Fn(&X) -> X::Elem, + G: Fn(&X) -> X, + { + type Iterate = X; + type Intermediate = HashMap<&'a str, $step>; + fn nb_iter(&self) -> usize { + self.counter.iter + } + fn iterate(&self) -> X { + self.x.clone() + } + fn intermediate(&self) -> Self::Intermediate { + HashMap::from([("sigma", self.sigma.clone())]) + } + } }; } pub(crate) use descent_rule; -pub(crate) use impl_iterator_descent; +pub(crate) use impl_optimizer_descent; diff --git a/src/first_order/steepest_descent.rs b/src/first_order/steepest_descent.rs index 4d2137f..2ab6ba3 100644 --- a/src/first_order/steepest_descent.rs +++ b/src/first_order/steepest_descent.rs @@ -1,18 +1,17 @@ mod armijo; mod powell_wolfe; mod unit_test; -use crate::first_order::adaptive_descent::{adadelta, adagrad}; +use crate::first_order::adaptive_descent::{AdaDelta, AdaGrad}; use crate::{ traits::{VecDot, Vector}, Number, Optimizer, TuutalError, }; -use armijo::armijo; +pub use armijo::Armijo; use core::{ fmt::Debug, ops::{Add, Mul}, }; -use num_traits::One; -use powell_wolfe::powell_wolfe; +pub use powell_wolfe::PowellWolfe; /// Parameters used in the a descent method. /// @@ -154,14 +153,6 @@ where assert!(beta > T::zero()); Self::AdaDelta { gamma, beta } } - fn step_size_is_scalar(&self) -> bool { - match self { - Self::Armijo { gamma: _, beta: _ } => true, - Self::PowellWolfe { gamma: _, beta: _ } => true, - Self::AdaGrad { gamma: _, beta: _ } => false, - Self::AdaDelta { gamma: _, beta: _ } => false, - } - } } /// A descent algorithm using some step size method. @@ -221,7 +212,7 @@ pub fn descent( gradf: G, x0: &X, params: &DescentParameter, - eps: X::Elem, + gtol: X::Elem, maxiter: usize, ) -> Result> where @@ -230,169 +221,18 @@ where F: Fn(&X) -> X::Elem, G: Fn(&X) -> X, { - let mut desc = Descent::new(f, gradf, x0.clone(), *params, eps); - desc.optimize(Some(maxiter)) -} - -/// Represents the sequence of iterates computed by a steepest descent algorithm. 
-pub struct Descent -where - X: Vector, - F: Fn(&X) -> X::Elem, - G: Fn(&X) -> X, -{ - f: F, - gradf: G, - params: DescentParameter, - x: X, - eps: X::Elem, - iter: usize, - sigma: X, - accum_grad: X, - accum_update: X, - fcalls: usize, -} - -impl Descent -where - X: Vector, - F: Fn(&X) -> X::Elem, - G: Fn(&X) -> X, -{ - pub fn new(f: F, gradf: G, x: X, params: DescentParameter, eps: X::Elem) -> Self - where - X: Vector, - for<'a> &'a X: Add, - { - let dim = x.len(); - if params.step_size_is_scalar() { - Self { - f, - gradf, - params, - x, - iter: 0, - eps, - sigma: X::zero(1), - accum_grad: X::zero(1), - accum_update: X::zero(1), - fcalls: 0, - } - } else { - Self { - f, - gradf, - params, - x, - iter: 0, - eps, - sigma: X::zero(dim), - accum_grad: X::zero(dim), - accum_update: X::zero(dim), - fcalls: 0, - } + match params { + DescentParameter::Armijo { gamma, beta } => { + Armijo::new(f, gradf, x0.clone(), *gamma, *beta, gtol).optimize(Some(maxiter)) } - } - /// Reference to the objective function - pub(crate) fn obj(&self) -> &F { - &self.f - } - /// Reference to the gradient of the objective function - pub(crate) fn grad_obj(&self) -> &G { - &self.gradf - } - /// Number of iterations done so far. - pub fn nb_iter(&self) -> usize { - self.iter - } - /// Current iterate. - pub fn x(&self) -> &X { - &self.x - } - /// Current step size. - pub fn sigma(&self) -> &X { - &self.sigma - } -} - -impl core::iter::Iterator for Descent -where - X: Vector + Clone + VecDot, - for<'a> &'a X: Add + Mul<&'a X, Output = X> + Mul, - F: Fn(&X) -> X::Elem, - G: Fn(&X) -> X, -{ - type Item = X; - fn next(&mut self) -> Option { - let neg_gradfx = -self.grad_obj()(&self.x); - let squared_norm_2_gradfx = neg_gradfx.dot(&neg_gradfx); - if squared_norm_2_gradfx <= (self.eps * self.eps) { - self.iter += 1; - None - } else { - let mut fcalls = self.fcalls; - self.sigma = match self.params { - DescentParameter::Armijo { gamma, beta } => [armijo( - self.obj(), - &self.x, - &neg_gradfx, - squared_norm_2_gradfx, - gamma, - beta, - &mut fcalls, - ) - .1] - .into_iter() - .collect::(), - DescentParameter::PowellWolfe { gamma, beta } => [powell_wolfe( - (self.obj(), self.grad_obj()), - &self.x, - &neg_gradfx, - squared_norm_2_gradfx, - gamma, - beta, - &mut fcalls, - ) - .1] - .into_iter() - .collect::(), - DescentParameter::AdaGrad { gamma, beta } => { - let squared_grad = &neg_gradfx * &neg_gradfx; - adagrad(&mut self.accum_grad, squared_grad, gamma, beta) - } - DescentParameter::AdaDelta { gamma, beta } => { - let squared_grad = &neg_gradfx * &neg_gradfx; - let step_size = adadelta( - &mut self.accum_grad, - &self.accum_update, - &squared_grad, - gamma, - beta, - ); - self.accum_update = gamma * &self.accum_update - + (X::Elem::one() - gamma) * (&step_size * &step_size) * squared_grad; - step_size - } - }; - self.x = &self.x + &self.sigma * neg_gradfx; - self.iter += 1; - Some(self.x.clone()) + DescentParameter::PowellWolfe { gamma, beta } => { + PowellWolfe::new(f, gradf, x0.clone(), *gamma, *beta, gtol).optimize(Some(maxiter)) + } + DescentParameter::AdaDelta { gamma, beta } => { + AdaDelta::new(f, gradf, x0.clone(), *gamma, *beta, gtol).optimize(Some(maxiter)) + } + DescentParameter::AdaGrad { gamma, beta } => { + AdaGrad::new(f, gradf, x0.clone(), *gamma, *beta, gtol).optimize(Some(maxiter)) } - } -} - -impl Optimizer for Descent -where - X: Vector + Clone + VecDot, - for<'a> &'a X: Add + Mul<&'a X, Output = X> + Mul, - F: Fn(&X) -> X::Elem, - G: Fn(&X) -> X, -{ - type Iterate = X; - fn nb_iter(&self) -> usize { - 
self.nb_iter() - } - fn iterate(&self) -> X { - self.x.clone() } } diff --git a/src/first_order/steepest_descent/armijo.rs b/src/first_order/steepest_descent/armijo.rs index 36cc145..6ca8364 100644 --- a/src/first_order/steepest_descent/armijo.rs +++ b/src/first_order/steepest_descent/armijo.rs @@ -1,48 +1,20 @@ use core::ops::{Add, Mul}; use crate::{ - first_order::macros::{descent_rule, impl_iterator_descent}, - traits::{Scalar, VecDot, Vector}, - Counter, + first_order::macros::{descent_rule, impl_optimizer_descent}, + traits::{VecDot, Vector}, + Counter, Optimizer, }; use hashbrown::HashMap; use num_traits::{Float, One, Zero}; -/// Computes a step size using the Armijo method. -pub(crate) fn armijo( - f: &F, - x: &X, - neg_gradfx: &X, - squared_norm_2_gradfx: A, - gamma: A, - beta: A, - fcalls: &mut usize, -) -> (X, A) -where - A: Scalar, - F: Fn(&X) -> A, - X: VecDot + Add, - for<'a> &'a X: Add, -{ - let mut sigma = A::one(); - let mut x_next = x + sigma * neg_gradfx; - let fx = f(x); - *fcalls += 1; - while f(&x_next) - fx > -sigma * gamma * squared_norm_2_gradfx { - sigma = beta * sigma; - x_next = x + sigma * neg_gradfx; - *fcalls += 1; - } - (x_next, sigma) -} - descent_rule!( Armijo, [::Elem; 1], [X::Elem::zero()], [].into() ); -impl_iterator_descent!(Armijo, [::Elem; 1]); +impl_optimizer_descent!(Armijo, [::Elem; 1]); impl<'a, X, F, G> Armijo<'a, X, F, G, [X::Elem; 1]> where @@ -64,6 +36,6 @@ where x_next = &self.x + sigma * &self.neg_gradfx; } self.x = x_next; - self.sigma = [sigma]; + self.sigma[0] = sigma; } } diff --git a/src/first_order/steepest_descent/powell_wolfe.rs b/src/first_order/steepest_descent/powell_wolfe.rs index df6ac92..c651a62 100644 --- a/src/first_order/steepest_descent/powell_wolfe.rs +++ b/src/first_order/steepest_descent/powell_wolfe.rs @@ -1,86 +1,20 @@ use core::ops::{Add, Mul}; use crate::{ - first_order::macros::{descent_rule, impl_iterator_descent}, + first_order::macros::{descent_rule, impl_optimizer_descent}, traits::{Number, VecDot, Vector}, - Counter, Scalar, + Counter, Optimizer, }; use hashbrown::HashMap; use num_traits::{Float, One, Zero}; -/// Computes a step size using the Powell Wolfe method. 
-pub(crate) fn powell_wolfe( - funcs: (&F, &G), - x: &X, - neg_gradfx: &X, - squared_norm_2_gradfx: A, - gamma: A, - beta: A, - fcalls: &mut usize, -) -> (X, A) -where - A: Scalar, - F: Fn(&X) -> A, - G: Fn(&X) -> X, - X: VecDot + Add, - for<'a> &'a X: Add, -{ - let (f, gradf) = funcs; - let mut sigma_minus = A::one(); - let mut x_next = x + sigma_minus * neg_gradfx; - let one_half = A::cast_from_f32(0.5); - let fx = f(x); - *fcalls += 1; - // The first if and else conditions guarantee having a segment [sigma_minus, sigma_plus] - // such that sigma_minus satisfies the armijo condition and sigma_plus does not - let mut sigma_plus = if f(&x_next) - fx <= -sigma_minus * gamma * squared_norm_2_gradfx { - *fcalls += 1; - if gradf(&x_next).dot(neg_gradfx) >= -beta * squared_norm_2_gradfx { - return (x_next, sigma_minus); - } - // Computation of sigma_plus - let two = A::cast_from_f32(2.); - let mut sigma_plus = two; - x_next = x + sigma_plus * neg_gradfx; - while f(&x_next) - fx <= -sigma_plus * gamma * squared_norm_2_gradfx { - sigma_plus = two * sigma_plus; - x_next = x + sigma_plus * neg_gradfx; - *fcalls += 1; - } - // At this stage sigma_plus is the smallest 2^k that does not satisfy the Armijo rule - sigma_minus = sigma_plus * one_half; // it satisfies the Armijo rule - sigma_plus - } else { - sigma_minus = one_half; - x_next = x + sigma_minus * neg_gradfx; - while f(&x_next) - fx > -sigma_minus * gamma * squared_norm_2_gradfx { - sigma_minus = one_half * sigma_minus; - x_next = x + sigma_minus * neg_gradfx; - *fcalls += 1; - } - sigma_minus * (A::cast_from_f32(2.)) // does not satisfy the Armijo rule - }; - x_next = x + sigma_minus * neg_gradfx; - while gradf(&x_next).dot(neg_gradfx) < -beta * squared_norm_2_gradfx { - let sigma = (sigma_minus + sigma_plus) * one_half; - x_next = x + sigma * neg_gradfx; - if f(&x_next) - fx <= -sigma * gamma * squared_norm_2_gradfx { - sigma_minus = sigma; - } else { - sigma_plus = sigma; - } - *fcalls += 1; - } - (x + sigma_minus * neg_gradfx, sigma_minus) -} - descent_rule!( PowellWolfe, [::Elem; 1], [X::Elem::zero()], [].into() ); -impl_iterator_descent!(PowellWolfe, [X::Elem; 1]); +impl_optimizer_descent!(PowellWolfe, [X::Elem; 1]); impl<'a, X, F, G> PowellWolfe<'a, X, F, G, [X::Elem; 1]> where diff --git a/src/lib.rs b/src/lib.rs index e1374cb..1124147 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -23,7 +23,7 @@ mod traits; mod utils; pub use error::{RootFindingError, TuutalError}; -pub use first_order::{descent, Descent, DescentParameter}; +pub use first_order::{descent, Armijo, DescentParameter, PowellWolfe}; pub use ndarray::{array, s, Array1, Array2}; use num_traits::Num; diff --git a/src/traits.rs b/src/traits.rs index ca391ea..391716b 100644 --- a/src/traits.rs +++ b/src/traits.rs @@ -100,6 +100,7 @@ where /// Implements an iterator counting the number of iterations done so far and a full optimization routine. pub trait Optimizer: core::iter::Iterator { type Iterate; + type Intermediate; /// Number of iterations done so far. fn nb_iter(&self) -> usize; /// Current iterate. 
@@ -120,6 +121,7 @@ pub trait Optimizer: core::iter::Iterator { } Ok(self.iterate()) } + fn intermediate(&self) -> Self::Intermediate; } /// Implements the notion of upper and lower bounds diff --git a/src/zero_order/nelder_mead.rs b/src/zero_order/nelder_mead.rs index 7821d7c..1910da9 100644 --- a/src/zero_order/nelder_mead.rs +++ b/src/zero_order/nelder_mead.rs @@ -606,10 +606,12 @@ where A: Scalar>, { type Iterate = Array1; + type Intermediate = (); fn nb_iter(&self) -> usize { self.iter } fn iterate(&self) -> Array1 { self.sim.row(0).to_owned() } + fn intermediate(&self) {} } diff --git a/src/zero_order/powell.rs b/src/zero_order/powell.rs index d12b1b8..e3a80f7 100644 --- a/src/zero_order/powell.rs +++ b/src/zero_order/powell.rs @@ -400,10 +400,12 @@ where F: Fn(&Array1) -> A, { type Iterate = Array1; + type Intermediate = (); fn nb_iter(&self) -> usize { self.iter } fn iterate(&self) -> Array1 { self.x.clone() } + fn intermediate(&self) {} } diff --git a/tests/quickckeck.rs b/tests/quickckeck.rs index a0a1d30..de674fe 100644 --- a/tests/quickckeck.rs +++ b/tests/quickckeck.rs @@ -1,10 +1,9 @@ -// #![cfg(feature = "quickcheck")] #[macro_use] extern crate quickcheck; extern crate tuutal; use core::cmp::min; -use tuutal::{s, Array1, Descent, DescentParameter}; +use tuutal::{s, Armijo, Array1, Optimizer, PowellWolfe}; quickcheck! { fn descent_armijo(xs: Vec) -> bool { @@ -23,8 +22,7 @@ quickcheck! { let eye = Array1::ones(arr.shape()[0]); let f = |x: &Array1| 0.5 * x.dot(x).powi(2) + eye.dot(x) + 1.; let gradf = |x: &Array1| 2. * x * x.dot(x) + eye.clone(); - let param = DescentParameter::new_armijo(0.01, 0.5); - let mut iterates = Descent::new(f, gradf, arr.to_owned(), param, 1e-3); + let mut iterates = Armijo::new(f, gradf, arr.to_owned(), 0.01, 0.5, 1e-3); let mut x_prev = arr.to_owned(); let mut iter = 1; while let Some(x) = iterates.next() { @@ -58,15 +56,15 @@ quickcheck! { let gradf = |x: &Array1| 2. * x * x.dot(x) ; let gamma = 0.001; let beta = 0.9; - let param = DescentParameter::new_powell_wolfe(gamma, beta); - let mut iterates = Descent::new(f, gradf, arr.to_owned(), param, 1e-3); + let mut iterates = PowellWolfe::new(f, gradf, arr.to_owned(), gamma, beta, 1e-3); let mut x_prev = arr.to_owned(); let mut iter = 1; while let Some(x_next) = iterates.next() { // let gradfx_next = gradf(&x_next); let neg_gradfx_prev = -gradf(&x_prev); let gradfx_d = neg_gradfx_prev.dot(&neg_gradfx_prev); - assert!(f(&x_next) <= f(&x_prev) - iterates.sigma()[0] * gamma * gradfx_d); + let intermed = iterates.intermediate(); + assert!(f(&x_next) <= f(&x_prev) - intermed["sigma"][0] * gamma * gradfx_d); x_prev = x_next; iter += 1; if iter > 10 {
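
Usage sketch (not part of the patch). The refactor above replaces the monolithic `Descent` struct with one type per step-size rule (`Armijo`, `PowellWolfe`, `AdaGrad`, `AdaDelta`), each implementing `Iterator` over the iterates as well as the extended `Optimizer` trait, while `descent` becomes a thin dispatcher over `DescentParameter`. The Rust sketch below shows how the new API could be driven; it assumes the constructor signature `Armijo::new(f, gradf, x0, gamma, beta, gtol)` seen in tests/quickckeck.rs, assumes `DescentParameter::new_armijo` remains available after this change, and uses an illustrative quadratic objective.

use tuutal::{array, descent, Armijo, Array1, DescentParameter, Optimizer};

fn main() {
    // Illustrative strongly convex objective and its gradient (not from the patch).
    let f = |x: &Array1<f32>| x.dot(x);
    let gradf = |x: &Array1<f32>| 2. * x;
    let x0 = array![1.0_f32, -2.0, 3.0];

    // High-level driver: `descent` now only dispatches on the DescentParameter variant.
    let params = DescentParameter::new_armijo(0.01, 0.5);
    if let Ok(x_min) = descent(f, gradf, &x0, &params, 1e-4, 1000) {
        println!("descent() returned {:?}", x_min);
    }

    // Low-level access: drive one rule directly as an Iterator over iterates and read
    // intermediate quantities through the new Optimizer::intermediate(); for
    // Armijo/PowellWolfe this map exposes the last accepted step size under "sigma".
    let mut solver = Armijo::new(f, gradf, x0.clone(), 0.01, 0.5, 1e-4);
    while let Some(x) = solver.next() {
        let step = solver.intermediate();
        println!(
            "iter {}: sigma = {}, x = {:?}",
            solver.nb_iter(),
            step["sigma"][0],
            x
        );
    }
}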