From bb49f9e129b9ccd513087e964c056aa53f186356 Mon Sep 17 00:00:00 2001
From: Ben Ruijl <benruyl@gmail.com>
Date: Fri, 16 Aug 2024 12:00:11 +0200
Subject: [PATCH] Add new expression evaluator to Python API

- Fix ASM output for repeated register occurrences
---
 examples/nested_evaluation.rs |  68 +----
 src/api/python.rs             | 458 +++++++++++++++++++++++++++++-----
 src/evaluate.rs               | 135 +++++++---
 src/id.rs                     |  20 +-
 symbolica.pyi                 | 118 ++++++++-
 5 files changed, 627 insertions(+), 172 deletions(-)
diff --git a/examples/nested_evaluation.rs b/examples/nested_evaluation.rs
index 488341d..59a2d36 100644
--- a/examples/nested_evaluation.rs
+++ b/examples/nested_evaluation.rs
@@ -1,9 +1,7 @@
-use std::time::Instant;
-
 use symbolica::{
-    atom::{Atom, AtomView},
-    domains::{float::Complex, rational::Rational},
-    evaluate::{CompileOptions, ExpressionEvaluator, FunctionMap, InlineASM, OptimizationSettings},
+    atom::Atom,
+    domains::rational::Rational,
+    evaluate::{CompileOptions, FunctionMap, InlineASM, OptimizationSettings},
     state::State,
 };
 
@@ -73,64 +71,20 @@ fn main() {
         OptimizationSettings::default(),
     )
     .unwrap();
+
     let mut e_f64 = evaluator.map_coeff(&|x| x.into());
-    let r = e_f64.evaluate(&[5.]);
+    let r = e_f64.evaluate_single(&[5.]);
     println!("{}", r);
 
-    let mut tree =
-        AtomView::to_eval_tree_multiple(&[e1.as_view(), e2.as_view()], &fn_map, &params).unwrap();
-
-    // optimize the tree using an occurrence-order Horner scheme
-    println!("Op original {:?}", tree.count_operations());
-    tree.horner_scheme();
-    println!("Op horner {:?}", tree.count_operations());
-    tree.common_subexpression_elimination();
-    println!("op cse {:?}", tree.count_operations());
-
-    //tree.common_pair_elimination();
-    //println!("op cpe {:?}", tree.count_operations());
-
-    let mut evaluator: ExpressionEvaluator<f64> = tree.linearize(None).map_coeff(&|x| x.into());
-
-    let ce = evaluator
-        .export_cpp(
-            "nested_evaluation.cpp",
-            "evaltest",
-            true,
-            InlineASM::default(),
-        )
+    let mut compiled = e_f64
+        .export_cpp("nested_evaluate.cpp", "nested", true, InlineASM::Intel)
         .unwrap()
-        .compile("libneval.so", CompileOptions::default())
+        .compile("nested", CompileOptions::default())
         .unwrap()
         .load()
         .unwrap();
 
-    let params = vec![5.];
-    let mut out = vec![0., 0.];
-    ce.evaluate(&params, &mut out);
-    println!("Eval from C++: {}, {}", out[0], out[1]);
-
-    {
-        let params = vec![Complex::new(5., 0.)];
-        let mut out = vec![Complex::new_zero(), Complex::new_zero()];
-        ce.evaluate(&params, &mut out);
-        println!("Eval from C++: {}, {}", out[0], out[1]);
-    }
-
-    // benchmark
-    let t = Instant::now();
-    for _ in 0..1000000 {
-        let _ = ce.evaluate(&params, &mut out);
-    }
-    println!("C++ time {:#?}", t.elapsed());
-
-    evaluator.evaluate_multiple(&params, &mut out);
-    println!("Eval: {}, {}", out[0], out[1]);
-
-    let params = vec![5.];
-    let t = Instant::now();
-    for _ in 0..1000000 {
-        evaluator.evaluate_multiple(&params, &mut out);
-    }
-    println!("Eager time {:#?}", t.elapsed());
+    let mut out = vec![0.];
+    compiled.evaluate(&[5.], &mut out);
+    println!("{}", out[0]);
 }
diff --git a/src/api/python.rs b/src/api/python.rs
index 999cd7a..2db6631 100644
--- a/src/api/python.rs
+++ b/src/api/python.rs
@@ -38,7 +38,10 @@ use crate::{
         },
         Ring,
     },
-    evaluate::EvaluationFn,
+    evaluate::{
+        CompileOptions, CompiledEvaluator, EvaluationFn, ExpressionEvaluator, FunctionMap,
+        InlineASM, OptimizationSettings,
+    },
     id::{
         Condition, Match, MatchSettings, MatchStack, Pattern, PatternAtomTreeIterator,
         PatternRestriction, ReplaceIterator, Replacement, WildcardAndRestriction,
@@ -46,15 +49,8 @@ use crate::{
     numerical_integration::{ContinuousGrid, DiscreteGrid, Grid, MonteCarloRng, Sample},
     parser::Token,
     poly::{
-        evaluate::{
-            InstructionEvaluator, InstructionSetMode, InstructionSetModeCPPSettings,
-            InstructionSetPrinter,
-        },
-        factor::Factorize,
-        groebner::GroebnerBasis,
-        polynomial::MultivariatePolynomial,
-        series::Series,
-        GrevLexOrder, LexOrder, Variable, INLINED_EXPONENTS,
+        factor::Factorize, groebner::GroebnerBasis, polynomial::MultivariatePolynomial,
+        series::Series, GrevLexOrder, LexOrder, Variable, INLINED_EXPONENTS,
     },
     printer::{
         AtomPrinter, MatrixPrinter, PolynomialPrinter, PrintOptions, RationalPolynomialPrinter,
@@ -82,7 +78,8 @@ fn symbolica(_py: Python, m: &PyModule) -> PyResult<()> {
     m.add_class::<PythonAtomType>()?;
     m.add_class::<PythonAtomTree>()?;
     m.add_class::<PythonReplacement>()?;
-    m.add_class::<PythonInstructionEvaluator>()?;
+    m.add_class::<PythonExpressionEvaluator>()?;
+    m.add_class::<PythonCompiledExpressionEvaluator>()?;
     m.add_class::<PythonRandomNumberGenerator>()?;
     m.add_class::<PythonPatternRestriction>()?;
     m.add_class::<PythonTermStreamer>()?;
@@ -2354,9 +2351,7 @@ impl PythonExpression {
             condition: (
                 id,
                 PatternRestriction::Filter(Box::new(move |m| {
-                    let mut a = Atom::default();
-                    m.to_atom(&mut a);
-                    let data: PythonExpression = a.into();
+                    let data: PythonExpression = m.to_atom().into();
 
                     Python::with_gil(|py| {
                         filter_fn
@@ -2518,12 +2513,8 @@ impl PythonExpression {
                 PatternRestriction::Cmp(
                     other_id,
                     Box::new(move |m1, m2| {
-                        let mut a = Atom::default();
-                        m1.to_atom(&mut a);
-                        let data1: PythonExpression = a.clone().into();
-
-                        m2.to_atom(&mut a);
-                        let data2: PythonExpression = a.into();
+                        let data1: PythonExpression = m1.to_atom().into();
+                        let data2: PythonExpression = m2.to_atom().into();
 
                         Python::with_gil(|py| {
                             cmp_fn
@@ -3649,6 +3640,188 @@ impl PythonExpression {
             .evaluate(|x| x.into(), &constants, &functions, &mut cache);
         Ok(PyComplex::from_doubles(py, r.re, r.im))
     }
+
+    /// Create an evaluator that can evaluate (nested) expressions in an optimized fashion.
+    /// All constants and functions should be provided as dictionaries, where the function
+    /// dictionary has a key `(name, printable name, arguments)` and the value is the function
+    /// body. For example the function `f(x,y)=x^2+y` should be provided as
+    /// `{(f, "f", (x, y)): x**2 + y}`. All free parameters should be provided in the `params` list.
+    ///
+    /// Examples
+    /// --------
+    /// >>> from symbolica import *
+    /// >>> x, y, z, pi, f, g = Expression.symbols(
+    /// >>>     'x', 'y', 'z', 'pi', 'f', 'g')
+    /// >>>
+    /// >>> e1 = Expression.parse("x + pi + cos(x) + f(g(x+1),x*2)")
+    /// >>> fd = Expression.parse("y^2 + z^2*y^2")
+    /// >>> gd = Expression.parse("y + 5")
+    /// >>>
+    /// >>> ev = e1.evaluator({pi: Expression.num(22)/7},
+    /// >>>              {(f, "f", (y, z)): fd, (g, "g", (y, )): gd}, [x])
+    /// >>> res = ev.evaluate([[1.], [2.], [3.]])  # evaluate at x=1, x=2, x=3
+    /// >>> print(res)
+    #[pyo3(signature =
+        (constants,
+        functions,
+        params,
+        iterations = 100,
+        n_cores = 4,
+        verbose = false),
+        )]
+    pub fn evaluator(
+        &self,
+        constants: HashMap<PythonExpression, PythonExpression>,
+        functions: HashMap<(Variable, String, Vec<Variable>), PythonExpression>,
+        params: Vec<PythonExpression>,
+        iterations: usize,
+        n_cores: usize,
+        verbose: bool,
+    ) -> PyResult<PythonExpressionEvaluator> {
+        let mut fn_map = FunctionMap::new();
+
+        for (k, v) in &constants {
+            if let Ok(r) = v.expr.clone().try_into() {
+                fn_map.add_constant(k.expr.as_view(), r);
+            } else {
+                Err(exceptions::PyValueError::new_err(format!(
+                        "Constants must be rationals. If this is not possible, pass the value as a parameter",
+                    )))?
+            }
+        }
+
+        for ((symbol, rename, args), body) in &functions {
+            let symbol = symbol
+                .to_id()
+                .ok_or(exceptions::PyValueError::new_err(format!(
+                    "Bad function name {}",
+                    symbol
+                )))?;
+            let args: Vec<_> = args
+                .iter()
+                .map(|x| {
+                    x.to_id().ok_or(exceptions::PyValueError::new_err(format!(
+                        "Bad function name {}",
+                        symbol
+                    )))
+                })
+                .collect::<Result<_, _>>()?;
+
+            fn_map
+                .add_function(symbol, rename.clone(), args, body.expr.as_view())
+                .map_err(|e| {
+                    exceptions::PyValueError::new_err(format!("Could not add function: {}", e))
+                })?;
+        }
+
+        let settings = OptimizationSettings {
+            horner_iterations: iterations,
+            n_cores,
+            verbose,
+            ..OptimizationSettings::default()
+        };
+
+        let params: Vec<_> = params.iter().map(|x| x.expr.clone()).collect();
+
+        let eval = self
+            .expr
+            .evaluator(&fn_map, &params, settings)
+            .map_err(|e| {
+                exceptions::PyValueError::new_err(format!("Could not create evaluator: {}", e))
+            })?;
+
+        let eval_f64 = eval.map_coeff(&|x| x.to_f64());
+
+        Ok(PythonExpressionEvaluator { eval: eval_f64 })
+    }
+
+    /// Create an evaluator that can jointly evaluate (nested) expressions in an optimized fashion.
+    /// See `Expression.evaluator()` for more information.
+    ///
+    /// Examples
+    /// --------
+    /// >>> from symbolica import *
+    /// >>> x = Expression.symbols('x')
+    /// >>> e1 = Expression.parse("x^2 + 1")
+    /// >>> e2 = Expression.parse("x^2 + 2)
+    /// >>> ev = Expression.evaluator_multiple([e1, e2], {}, {}, [x])
+    ///
+    /// will recycle the `x^2`
+    #[classmethod]
+    #[pyo3(signature =
+        (exprs,
+        constants,
+        functions,
+        params,
+        iterations = 100,
+        n_cores = 4,
+        verbose = false),
+        )]
+    pub fn evaluator_multiple(
+        _cls: &PyType,
+        exprs: Vec<PythonExpression>,
+        constants: HashMap<PythonExpression, PythonExpression>,
+        functions: HashMap<(Variable, String, Vec<Variable>), PythonExpression>,
+        params: Vec<PythonExpression>,
+        iterations: usize,
+        n_cores: usize,
+        verbose: bool,
+    ) -> PyResult<PythonExpressionEvaluator> {
+        let mut fn_map = FunctionMap::new();
+
+        for (k, v) in &constants {
+            if let Ok(r) = v.expr.clone().try_into() {
+                fn_map.add_constant(k.expr.as_view(), r);
+            } else {
+                Err(exceptions::PyValueError::new_err(format!(
+                    "Constants must be rationals. If this is not possible, pass the value as a parameter",
+                )))?
+            }
+        }
+
+        for ((symbol, rename, args), body) in &functions {
+            let symbol = symbol
+                .to_id()
+                .ok_or(exceptions::PyValueError::new_err(format!(
+                    "Bad function name {}",
+                    symbol
+                )))?;
+            let args: Vec<_> = args
+                .iter()
+                .map(|x| {
+                    x.to_id().ok_or(exceptions::PyValueError::new_err(format!(
+                        "Bad function name {}",
+                        symbol
+                    )))
+                })
+                .collect::<Result<_, _>>()?;
+
+            fn_map
+                .add_function(symbol, rename.clone(), args, body.expr.as_view())
+                .map_err(|e| {
+                    exceptions::PyValueError::new_err(format!("Could not add function: {}", e))
+                })?;
+        }
+
+        let settings = OptimizationSettings {
+            horner_iterations: iterations,
+            n_cores,
+            verbose,
+            ..OptimizationSettings::default()
+        };
+
+        let params: Vec<_> = params.iter().map(|x| x.expr.clone()).collect();
+
+        let exprs = exprs.iter().map(|x| x.expr.as_view()).collect::<Vec<_>>();
+
+        let eval = Atom::evaluator_multiple(&exprs, &fn_map, &params, settings).map_err(|e| {
+            exceptions::PyValueError::new_err(format!("Could not create evaluator: {}", e))
+        })?;
+
+        let eval_f64 = eval.map_coeff(&|x| x.to_f64());
+
+        Ok(PythonExpressionEvaluator { eval: eval_f64 })
+    }
 }
 
 /// A raplacement, which is a pattern and a right-hand side, with optional conditions and settings.
@@ -4160,13 +4333,7 @@ impl PythonMatchIterator {
             i.next().map(|m| {
                 m.match_stack
                     .into_iter()
-                    .map(|m| {
-                        (Atom::new_var(m.0).into(), {
-                            let mut a = Atom::default();
-                            m.1.to_atom(&mut a);
-                            a.into()
-                        })
-                    })
+                    .map(|m| (Atom::new_var(m.0).into(), { m.1.to_atom().into() }))
                     .collect()
             })
         })
@@ -4762,41 +4929,6 @@ impl PythonPolynomial {
         }
     }
 
-    /// Optimize the polynomial for evaluation using `iterations` number of iterations.
-    /// The optimized output can be exported in a C++ format using `to_file`.
-    ///
-    /// Returns an evaluator for the polynomial.
-    #[pyo3(signature = (iterations = 1000, to_file = None))]
-    pub fn optimize(
-        &self,
-        iterations: usize,
-        to_file: Option<String>,
-    ) -> PyResult<PythonInstructionEvaluator> {
-        let o = self.poly.optimize(iterations);
-        if let Some(file) = to_file.as_ref() {
-            std::fs::write(
-                file,
-                format!(
-                    "{}",
-                    InstructionSetPrinter {
-                        name: "evaluate".to_string(),
-                        instr: &o,
-                        mode: InstructionSetMode::CPP(InstructionSetModeCPPSettings {
-                            write_header_and_test: true,
-                            always_pass_output_array: false,
-                        })
-                    }
-                ),
-            )
-            .unwrap();
-        }
-
-        let o_f64 = o.convert::<f64>();
-        Ok(PythonInstructionEvaluator {
-            instr: o_f64.evaluator(),
-        })
-    }
-
     /// Compute the Groebner basis of a polynomial system.
     ///
     /// If `grevlex=True`, reverse graded lexicographical ordering is used,
@@ -8272,23 +8404,211 @@ impl ConvertibleToRationalPolynomial {
     }
 }
 
+/// An optimized evaluator for expressions.
 #[pyclass(name = "Evaluator", module = "symbolica")]
 #[derive(Clone)]
-pub struct PythonInstructionEvaluator {
-    pub instr: InstructionEvaluator<f64>,
+pub struct PythonExpressionEvaluator {
+    pub eval: ExpressionEvaluator<f64>,
+}
+
+/// A compiled and optimized evaluator for expressions.
+#[pyclass(name = "CompiledEvaluator", module = "symbolica")]
+#[derive(Clone)]
+pub struct PythonCompiledExpressionEvaluator {
+    pub eval: CompiledEvaluator,
+    pub output_len: usize,
+}
+
+#[pymethods]
+impl PythonCompiledExpressionEvaluator {
+    /// Load a compiled library, previously generated with `compile`.
+    #[classmethod]
+    fn load(
+        _cls: &PyType,
+        filename: &str,
+        function_name: &str,
+        output_len: usize,
+    ) -> PyResult<Self> {
+        Ok(Self {
+            eval: CompiledEvaluator::load(filename, function_name)
+                .map_err(|e| exceptions::PyValueError::new_err(format!("Load error: {}", e)))?,
+            output_len,
+        })
+    }
+
+    /// Evaluate the expression for multiple inputs and return the result.
+    fn evaluate_single(&mut self, inputs: Vec<Vec<f64>>) -> Vec<f64> {
+        let mut res = vec![0.; self.output_len];
+        inputs
+            .iter()
+            .map(|s| {
+                self.eval.evaluate(s, &mut res);
+                res[0]
+            })
+            .collect()
+    }
+
+    /// Evaluate the expression for multiple inputs and return the result.
+    fn evaluate_complex_single<'py>(
+        &mut self,
+        py: Python<'py>,
+        inputs: Vec<Vec<Complex<f64>>>,
+    ) -> Vec<&'py PyComplex> {
+        let mut res = vec![Complex::new_zero(); self.output_len];
+        inputs
+            .iter()
+            .map(|s| {
+                self.eval.evaluate(s, &mut res);
+                PyComplex::from_doubles(py, res[0].re, res[0].im)
+            })
+            .collect()
+    }
+
+    /// Evaluate the expression for multiple inputs and return the results.
+    fn evaluate(&mut self, inputs: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
+        inputs
+            .iter()
+            .map(|s| {
+                let mut v = vec![0.; self.output_len];
+                self.eval.evaluate(s, &mut v);
+                v
+            })
+            .collect()
+    }
+
+    /// Evaluate the expression for multiple inputs and return the results.
+    fn evaluate_complex<'py>(
+        &mut self,
+        python: Python<'py>,
+        inputs: Vec<Vec<Complex<f64>>>,
+    ) -> Vec<Vec<&'py PyComplex>> {
+        let mut v = vec![Complex::new_zero(); self.output_len];
+        inputs
+            .iter()
+            .map(|s| {
+                self.eval.evaluate(s, &mut v);
+                v.iter()
+                    .map(|x| PyComplex::from_doubles(python, x.re, x.im))
+                    .collect()
+            })
+            .collect()
+    }
 }
 
 #[pymethods]
-impl PythonInstructionEvaluator {
-    /// Evaluate the polynomial for multiple inputs and return the result.
-    fn evaluate(&self, inputs: Vec<Vec<f64>>) -> Vec<f64> {
-        let mut eval = self.instr.clone();
+impl PythonExpressionEvaluator {
+    /// Evaluate the expression for multiple inputs and return the result.
+    fn evaluate_single(&mut self, inputs: Vec<Vec<f64>>) -> Vec<f64> {
+        let mut res = vec![0.];
+        inputs
+            .iter()
+            .map(|s| {
+                self.eval.evaluate(s, &mut res);
+                res[0]
+            })
+            .collect()
+    }
+
+    /// Evaluate the expression for multiple inputs and return the result.
+    fn evaluate_complex_single<'py>(
+        &mut self,
+        py: Python<'py>,
+        inputs: Vec<Vec<Complex<f64>>>,
+    ) -> Vec<&'py PyComplex> {
+        // FIXME: clone every time
+        let mut eval = self.eval.clone().map_coeff(&|x| Complex::new(*x, 0.));
+
+        let mut res = vec![Complex::new_zero()];
+        inputs
+            .iter()
+            .map(|s| {
+                eval.evaluate(s, &mut res);
+                PyComplex::from_doubles(py, res[0].re, res[0].im)
+            })
+            .collect()
+    }
 
+    /// Evaluate the expression for multiple inputs and return the results.
+    fn evaluate(&mut self, inputs: Vec<Vec<f64>>) -> Vec<Vec<f64>> {
         inputs
             .iter()
-            .map(|s| eval.evaluate_with_input(s)[0])
+            .map(|s| {
+                let mut v = vec![0.; self.eval.get_output_len()];
+                self.eval.evaluate(s, &mut v);
+                v
+            })
             .collect()
     }
+
+    /// Evaluate the expression for multiple inputs and return the results.
+    fn evaluate_complex<'py>(
+        &mut self,
+        python: Python<'py>,
+        inputs: Vec<Vec<Complex<f64>>>,
+    ) -> Vec<Vec<&'py PyComplex>> {
+        let mut eval = self.eval.clone().map_coeff(&|x| Complex::new(*x, 0.));
+
+        let mut v = vec![Complex::new_zero(); self.eval.get_output_len()];
+        inputs
+            .iter()
+            .map(|s| {
+                eval.evaluate(s, &mut v);
+                v.iter()
+                    .map(|x| PyComplex::from_doubles(python, x.re, x.im))
+                    .collect()
+            })
+            .collect()
+    }
+
+    /// Compile the evaluator to a shared library using C++ and optionally inline assembly and load it.
+    #[pyo3(signature =
+        (function_name,
+        filename,
+        library_name,
+        inline_asm = true,
+        optimization_level = 3,
+        compiler_path = None,
+    ))]
+    fn compile(
+        &self,
+        function_name: &str,
+        filename: &str,
+        library_name: &str,
+        inline_asm: bool,
+        optimization_level: u8,
+        compiler_path: Option<&str>,
+    ) -> PyResult<PythonCompiledExpressionEvaluator> {
+        let mut options = CompileOptions::default();
+        options.optimization_level = optimization_level as usize;
+        if let Some(compiler_path) = compiler_path {
+            options.compiler = compiler_path.to_string();
+        }
+
+        Ok(PythonCompiledExpressionEvaluator {
+            eval: self
+                .eval
+                .export_cpp(
+                    filename,
+                    function_name,
+                    true,
+                    if inline_asm {
+                        InlineASM::Intel
+                    } else {
+                        InlineASM::None
+                    },
+                )
+                .map_err(|e| exceptions::PyValueError::new_err(format!("Export error: {}", e)))?
+                .compile(library_name, options)
+                .map_err(|e| {
+                    exceptions::PyValueError::new_err(format!("Compilation error: {}", e))
+                })?
+                .load()
+                .map_err(|e| {
+                    exceptions::PyValueError::new_err(format!("Library loading error: {}", e))
+                })?,
+            output_len: self.eval.get_output_len(),
+        })
+    }
 }
 
 #[derive(FromPyObject)]
diff --git a/src/evaluate.rs b/src/evaluate.rs
index 60d220a..0e4bc5c 100644
--- a/src/evaluate.rs
+++ b/src/evaluate.rs
@@ -14,11 +14,14 @@ use self_cell::self_cell;
 use crate::{
     atom::{representation::InlineVar, Atom, AtomOrView, AtomView, Symbol},
     coefficient::CoefficientView,
+    combinatorics::unique_permutations,
     domains::{
         float::{Complex, NumericalFloatLike, Real},
+        integer::Integer,
         rational::Rational,
     },
     state::State,
+    LicenseManager,
 };
 
 type EvalFnType<T> = Box<
@@ -146,6 +149,7 @@ pub struct OptimizationSettings {
     pub n_cores: usize,
     pub cpe_iterations: Option<usize>,
     pub hot_start: Option<Vec<Expression<Rational>>>,
+    pub verbose: bool,
 }
 
 impl Default for OptimizationSettings {
@@ -155,6 +159,7 @@ impl Default for OptimizationSettings {
             n_cores: 1,
             cpe_iterations: None,
             hot_start: None,
+            verbose: false,
         }
     }
 }
@@ -188,7 +193,9 @@ impl Atom {
     }
 
     /// Create an efficient evaluator for a (nested) expression.
-    /// All variables and all user functions in the expression must occur in the map.
+    /// All free parameters must appear in `params` and all other variables
+    /// and user functions in the expression must occur in the function map.
+    /// The function map may have nested expressions.
     pub fn evaluator<'a>(
         &'a self,
         fn_map: &FunctionMap<'a, Rational>,
@@ -200,6 +207,7 @@ impl Atom {
             optimization_settings.horner_iterations,
             optimization_settings.n_cores,
             optimization_settings.hot_start.clone(),
+            optimization_settings.verbose,
         ))
     }
 
@@ -217,6 +225,7 @@ impl Atom {
             optimization_settings.horner_iterations,
             optimization_settings.n_cores,
             optimization_settings.hot_start.clone(),
+            optimization_settings.verbose,
         ))
     }
 }
@@ -643,13 +652,13 @@ pub struct ExpressionEvaluator<T> {
 }
 
 impl<T: Real> ExpressionEvaluator<T> {
-    pub fn evaluate(&mut self, params: &[T]) -> T {
+    pub fn evaluate_single(&mut self, params: &[T]) -> T {
         let mut res = T::new_zero();
-        self.evaluate_multiple(params, std::slice::from_mut(&mut res));
+        self.evaluate(params, std::slice::from_mut(&mut res));
         res
     }
 
-    pub fn evaluate_multiple(&mut self, params: &[T], out: &mut [T]) {
+    pub fn evaluate(&mut self, params: &[T], out: &mut [T]) {
         for (t, p) in self.stack.iter_mut().zip(params) {
             *t = p.clone();
         }
@@ -712,6 +721,10 @@ impl<T: Default> ExpressionEvaluator<T> {
         }
     }
 
+    pub fn get_output_len(&self) -> usize {
+        self.result_indices.len()
+    }
+
     fn remove_common_pairs(&mut self) -> usize {
         let mut pairs: HashMap<_, Vec<usize>> = HashMap::default();
 
@@ -1043,14 +1056,19 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
         include_header: bool,
         inline_asm: InlineASM,
     ) -> Result<ExportedCode, std::io::Error> {
+        let mut filename = filename.to_string();
+        if !filename.ends_with(".cpp") {
+            filename += ".cpp";
+        }
+
         let cpp = match inline_asm {
             InlineASM::Intel => self.export_asm_str(function_name, include_header),
             InlineASM::None => self.export_cpp_str(function_name, include_header),
         };
 
-        let _ = std::fs::write(filename, cpp)?;
+        let _ = std::fs::write(&filename, cpp)?;
         Ok(ExportedCode {
-            source_filename: filename.to_string(),
+            source_filename: filename,
             function_name: function_name.to_string(),
             inline_asm,
         })
@@ -1461,8 +1479,9 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
                         MemOrReg::Reg(out_reg) => {
                             if let Some(j) = a.iter().find(|x| **x == MemOrReg::Reg(*out_reg)) {
                                 // we can recycle the register completely
+                                let mut first_skipped = false;
                                 for i in a {
-                                    if i != j {
+                                    if first_skipped || i != j {
                                         match i {
                                             MemOrReg::Reg(k) => {
                                                 *out += &format!(
@@ -1480,14 +1499,16 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
                                             }
                                         }
                                     }
+                                    first_skipped |= i == j;
                                 }
                             } else if let Some(MemOrReg::Reg(j)) =
                                 a.iter().find(|x| matches!(x, MemOrReg::Reg(_)))
                             {
                                 *out += &format!("\t\t\"movapd xmm{}, xmm{}\\n\\t\"\n", out_reg, j);
 
+                                let mut first_skipped = false;
                                 for i in a {
-                                    if *i != MemOrReg::Reg(*j) {
+                                    if first_skipped || *i != MemOrReg::Reg(*j) {
                                         match i {
                                             MemOrReg::Reg(k) => {
                                                 *out += &format!(
@@ -1505,6 +1526,7 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
                                             }
                                         }
                                     }
+                                    first_skipped |= *i == MemOrReg::Reg(*j);
                                 }
                             } else {
                                 if let MemOrReg::Mem(k) = &a[0] {
@@ -1538,8 +1560,9 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
                                     *out +=
                                         &format!("\t\t\"movapd xmm{}, xmm{}\\n\\t\"\n", out_reg, j);
 
+                                    let mut first_skipped = false;
                                     for i in a {
-                                        if *i != MemOrReg::Reg(*j) {
+                                        if first_skipped || *i != MemOrReg::Reg(*j) {
                                             match i {
                                                 MemOrReg::Reg(k) => {
                                                     *out += &format!(
@@ -1557,6 +1580,8 @@ impl<T: std::fmt::Display> ExpressionEvaluator<T> {
                                                 }
                                             }
                                         }
+
+                                        first_skipped |= *i == MemOrReg::Reg(*j);
                                     }
                                 } else {
                                     if let MemOrReg::Mem(k) = &a[0] {
@@ -2299,8 +2324,9 @@ impl EvalTree<Rational> {
         iterations: usize,
         n_cores: usize,
         start_scheme: Option<Vec<Expression<Rational>>>,
+        verbose: bool,
     ) -> ExpressionEvaluator<Rational> {
-        let _ = self.optimize_horner_scheme(iterations, n_cores, start_scheme);
+        let _ = self.optimize_horner_scheme(iterations, n_cores, start_scheme, verbose);
         self.common_subexpression_elimination();
         self.clone().linearize(None)
     }
@@ -2335,6 +2361,7 @@ impl EvalTree<Rational> {
         iterations: usize,
         n_cores: usize,
         start_scheme: Option<Vec<Expression<Rational>>>,
+        verbose: bool,
     ) -> Vec<Expression<Rational>> {
         let v = match start_scheme {
             Some(a) => a,
@@ -2363,6 +2390,7 @@ impl EvalTree<Rational> {
             &v,
             iterations,
             n_cores,
+            verbose,
         );
         for e in &mut self.expressions.tree {
             e.apply_horner_scheme(&scheme);
@@ -2384,8 +2412,9 @@ impl EvalTree<Rational> {
             v.sort_by_key(|k| std::cmp::Reverse(k.1));
             let v = v.into_iter().map(|(k, _)| k).collect::<Vec<_>>();
 
-            let scheme =
-                Expression::optimize_horner_scheme_multiple(&e.tree, &v, iterations, n_cores);
+            let scheme = Expression::optimize_horner_scheme_multiple(
+                &e.tree, &v, iterations, n_cores, verbose,
+            );
 
             for t in &mut e.tree {
                 t.apply_horner_scheme(&scheme);
@@ -2660,8 +2689,15 @@ impl Expression<Rational> {
         vars: &[Self],
         iterations: usize,
         n_cores: usize,
+        verbose: bool,
     ) -> Vec<Self> {
-        Self::optimize_horner_scheme_multiple(std::slice::from_ref(self), vars, iterations, n_cores)
+        Self::optimize_horner_scheme_multiple(
+            std::slice::from_ref(self),
+            vars,
+            iterations,
+            n_cores,
+            verbose,
+        )
     }
 
     pub fn optimize_horner_scheme_multiple(
@@ -2669,6 +2705,7 @@ impl Expression<Rational> {
         vars: &[Self],
         iterations: usize,
         n_cores: usize,
+        verbose: bool,
     ) -> Vec<Self> {
         if vars.len() == 0 {
             return vars.to_vec();
@@ -2689,33 +2726,65 @@ impl Expression<Rational> {
             best_ops = (best_ops.0 + ops.0, best_ops.1 + ops.1);
         }
 
-        println!("init {:?} {:?} {}", best_ops, vars, vars.len());
+        if verbose {
+            println!(
+                "Initial ops: {} additions and {} multiplications",
+                best_ops.0, best_ops.1
+            );
+        }
 
         let best_mul = Arc::new(AtomicUsize::new(best_ops.1));
         let best_add = Arc::new(AtomicUsize::new(best_ops.0));
         let best_scheme = Arc::new(Mutex::new(vars.to_vec()));
 
+        let permutations =
+            if vars.len() < 10 && Integer::factorial(vars.len() as u32) <= iterations.max(1) {
+                let v: Vec<_> = (0..vars.len()).collect();
+                Some(unique_permutations(&v).1)
+            } else {
+                None
+            };
+        let p_ref = &permutations;
+
+        let n_cores = if LicenseManager::is_licensed() {
+            n_cores
+        } else {
+            1
+        };
+
         std::thread::scope(|s| {
-            for _ in 0..n_cores {
-                let mut vars = vars.to_vec();
+            for i in 0..n_cores {
+                let mut cvars = vars.to_vec();
                 let best_scheme = best_scheme.clone();
                 let best_mul = best_mul.clone();
                 let best_add = best_add.clone();
                 s.spawn(move || {
                     let mut r = thread_rng();
 
-                    for _ in 0..iterations / n_cores {
+                    for j in 0..iterations / n_cores {
                         // try a random swap
-                        let t1 = r.gen_range(0..vars.len());
-                        let t2 = r.gen_range(0..vars.len());
+                        let mut t1 = 0;
+                        let mut t2 = 0;
+
+                        if let Some(p) = p_ref {
+                            if j >= p.len() / n_cores {
+                                break;
+                            }
+
+                            let perm = &p[i * (p.len() / n_cores) + j];
+                            cvars = perm.iter().map(|x| vars[*x].clone()).collect();
+                        } else {
+                            t1 = r.gen_range(0..cvars.len());
+                            t2 = r.gen_range(0..cvars.len() - 1);
 
-                        vars.swap(t1, t2);
+                            cvars.swap(t1, t2);
+                        }
 
                         let horner: Vec<_> = expressions
                             .iter()
                             .map(|x| {
                                 let mut h = x.clone();
-                                h.apply_horner_scheme(&vars);
+                                h.apply_horner_scheme(&cvars);
                                 h.to_hashed_expression().1
                             })
                             .collect();
@@ -2732,13 +2801,19 @@ impl Expression<Rational> {
                             || cur_ops.1 == best_mul.load(Ordering::Relaxed)
                                 && cur_ops.0 <= best_add.load(Ordering::Relaxed)
                         {
-                            println!("new best {:?}", cur_ops);
+                            if verbose {
+                                println!(
+                                    "Accept move at core iteration {}/{}: {} additions and {} multiplications",
+                                    j, iterations / n_cores,
+                                    cur_ops.0, cur_ops.1
+                                );
+                            }
 
                             best_mul.store(cur_ops.1, Ordering::Relaxed);
                             best_add.store(cur_ops.0, Ordering::Relaxed);
-                            best_scheme.lock().unwrap().clone_from_slice(&vars);
+                            best_scheme.lock().unwrap().clone_from_slice(&cvars);
                         } else {
-                            vars.swap(t1, t2);
+                            cvars.swap(t1, t2);
                         }
                     }
                 });
@@ -3250,19 +3325,19 @@ impl Clone for CompiledEvaluator {
 
 /// A floating point type that can be used for compiled evaluation.
 pub trait CompiledEvaluatorFloat: Sized {
-    fn evaluate(eval: &CompiledEvaluator, args: &[Self], out: &mut [Self]);
+    fn evaluate(eval: &mut CompiledEvaluator, args: &[Self], out: &mut [Self]);
 }
 
 impl CompiledEvaluatorFloat for f64 {
     #[inline(always)]
-    fn evaluate(eval: &CompiledEvaluator, args: &[Self], out: &mut [Self]) {
+    fn evaluate(eval: &mut CompiledEvaluator, args: &[Self], out: &mut [Self]) {
         eval.evaluate_double(args, out);
     }
 }
 
 impl CompiledEvaluatorFloat for Complex<f64> {
     #[inline(always)]
-    fn evaluate(eval: &CompiledEvaluator, args: &[Self], out: &mut [Self]) {
+    fn evaluate(eval: &mut CompiledEvaluator, args: &[Self], out: &mut [Self]) {
         eval.evaluate_complex(args, out);
     }
 }
@@ -3356,13 +3431,13 @@ impl CompiledEvaluator {
 
     /// Evaluate the compiled code.
     #[inline(always)]
-    pub fn evaluate<T: CompiledEvaluatorFloat>(&self, args: &[T], out: &mut [T]) {
+    pub fn evaluate<T: CompiledEvaluatorFloat>(&mut self, args: &[T], out: &mut [T]) {
         T::evaluate(self, args, out);
     }
 
     /// Evaluate the compiled code with double-precision floating point numbers.
     #[inline(always)]
-    pub fn evaluate_double(&self, args: &[f64], out: &mut [f64]) {
+    pub fn evaluate_double(&mut self, args: &[f64], out: &mut [f64]) {
         unsafe {
             (self.library.borrow_dependent().eval_double)(
                 args.as_ptr(),
@@ -3374,7 +3449,7 @@ impl CompiledEvaluator {
 
     /// Evaluate the compiled code with complex numbers.
     #[inline(always)]
-    pub fn evaluate_complex(&self, args: &[Complex<f64>], out: &mut [Complex<f64>]) {
+    pub fn evaluate_complex(&mut self, args: &[Complex<f64>], out: &mut [Complex<f64>]) {
         unsafe {
             (self.library.borrow_dependent().eval_complex)(
                 args.as_ptr(),
diff --git a/src/id.rs b/src/id.rs
index 919c359..df6347f 100644
--- a/src/id.rs
+++ b/src/id.rs
@@ -821,7 +821,7 @@ impl Pattern {
         match self {
             Pattern::Wildcard(name) => {
                 if let Some(w) = match_stack.get(*name) {
-                    w.to_atom(out);
+                    w.to_atom_into(out);
                 } else if match_stack.settings.allow_new_wildcards_on_rhs {
                     out.to_var(*name);
                 } else {
@@ -863,7 +863,7 @@ impl Pattern {
                                     }
                                     _ => {
                                         let mut handle = workspace.new_atom();
-                                        w.to_atom(&mut handle);
+                                        w.to_atom_into(&mut handle);
                                         func.add_arg(handle.as_view())
                                     }
                                 },
@@ -902,7 +902,7 @@ impl Pattern {
                                 Match::Single(s) => out.set_from_view(s),
                                 Match::Multiple(_, _) => {
                                     let mut handle = workspace.new_atom();
-                                    w.to_atom(&mut handle);
+                                    w.to_atom_into(&mut handle);
                                     out.set_from_view(&handle.as_view())
                                 }
                                 Match::FunctionName(_) => {
@@ -947,7 +947,7 @@ impl Pattern {
                                     }
                                     _ => {
                                         let mut handle = workspace.new_atom();
-                                        w.to_atom(&mut handle);
+                                        w.to_atom_into(&mut handle);
                                         mul.extend(handle.as_view())
                                     }
                                 },
@@ -990,7 +990,7 @@ impl Pattern {
                                     }
                                     _ => {
                                         let mut handle = workspace.new_atom();
-                                        w.to_atom(&mut handle);
+                                        w.to_atom_into(&mut handle);
                                         add.extend(handle.as_view())
                                     }
                                 },
@@ -1488,7 +1488,15 @@ impl<'a> std::fmt::Debug for Match<'a> {
 impl<'a> Match<'a> {
     /// Create a new atom from a matched subexpression.
     /// Arguments lists are wrapped in the function `arg`.
-    pub fn to_atom(&self, out: &mut Atom) {
+    pub fn to_atom(&self) -> Atom {
+        let mut out = Atom::default();
+        self.to_atom_into(&mut out);
+        out
+    }
+
+    /// Create a new atom from a matched subexpression.
+    /// Arguments lists are wrapped in the function `arg`.
+    pub fn to_atom_into(&self, out: &mut Atom) {
         match self {
             Self::Single(v) => {
                 out.set_from_view(v);
diff --git a/symbolica.pyi b/symbolica.pyi
index 7a239a8..987a20e 100644
--- a/symbolica.pyi
+++ b/symbolica.pyi
@@ -1079,6 +1079,62 @@ class Expression:
         >>> print(e.evaluate_complex({x: 1 + 2j, y: 4 + 3j}, {}))
         """
 
+    def evaluator(
+        self,
+        constants: dict[Expression, Expression],
+        funs: dict[Tuple[Expression, str, Sequence[Expression]], Expression],
+        params: Sequence[Expression],
+        iterations: int = 100,
+        n_cores: int = 4,
+        verbose: bool = False,
+    ) -> Evaluator:
+        """Create an evaluator that can evaluate (nested) expressions in an optimized fashion.
+        All constants and functions should be provided as dictionaries, where the function
+        dictionary has a key `(name, printable name, arguments)` and the value is the function
+        body. For example the function `f(x,y)=x^2+y` should be provided as
+        `{(f, "f", (x, y)): x**2 + y}`. All free parameters should be provided in the `params` list.
+
+        Examples
+        --------
+        >>> from symbolica import *
+        >>> x, y, z, pi, f, g = Expression.symbols(
+        >>>     'x', 'y', 'z', 'pi', 'f', 'g')
+        >>>
+        >>> e1 = Expression.parse("x + pi + cos(x) + f(g(x+1),x*2)")
+        >>> fd = Expression.parse("y^2 + z^2*y^2")
+        >>> gd = Expression.parse("y + 5")
+        >>>
+        >>> ev = e1.evaluator({pi: Expression.num(22)/7},
+        >>>              {(f, "f", (y, z)): fd, (g, "g", (y, )): gd}, [x])
+        >>> res = ev.evaluate([[1.], [2.], [3.]])  # evaluate at x=1, x=2, x=3
+        >>> print(res)
+        """
+
+    @classmethod
+    def evaluator_multiple(
+        _cls,
+        exprs: Sequence[Expression],
+        constants: dict[Expression, Expression],
+        funs: dict[Tuple[Expression, str, Sequence[Expression]], Expression],
+        params: Sequence[Expression],
+        iterations: int = 100,
+        n_cores: int = 4,
+        verbose: bool = False,
+    ) -> Evaluator:
+        """Create an evaluator that can jointly evaluate (nested) expressions in an optimized fashion.
+        See `Expression.evaluator()` for more information.
+
+        Examples
+        --------
+        >>> from symbolica import *
+        >>> x = Expression.symbols('x')
+        >>> e1 = Expression.parse("x^2 + 1")
+        >>> e2 = Expression.parse("x^2 + 2)
+        >>> ev = Expression.evaluator_multiple([e1, e2], {}, {}, [x])
+
+        will recycle the `x^2`
+        """
+
 
 class Replacement:
     """A replacement of a pattern by a right-hand side."""
@@ -1810,14 +1866,6 @@ class Polynomial:
     def to_finite_field(self, prime: int) -> FiniteFieldPolynomial:
         """Convert the coefficients of the polynomial to a finite field with prime `prime`."""
 
-    def optimize(self, iterations: int = 1000, to_file: str | None = None) -> Evaluator:
-        """
-        Optimize the polynomial for evaluation using `iterations` number of iterations.
-        The optimized output can be exported in a C++ format using `to_file`.
-
-        Returns an evaluator for the polynomial.
-        """
-
     def factor_square_free(self) -> list[Tuple[Polynomial, int]]:
         """Compute the square-free factorization of the polynomial.
 
@@ -2790,8 +2838,58 @@ class Matrix:
 class Evaluator:
     """An optimized evaluator of an expression."""
 
-    def evaluate(self, inputs: Sequence[Sequence[float]]) -> List[float]:
-        """Evaluate the polynomial for multiple inputs and return the result."""
+    def compile(
+        self,
+        function_name: str,
+        filename: str,
+        library_name: str,
+        inline_asm: bool = True,
+        optimization_level: int = 3,
+        compiler_path: Optional[str] = None,
+    ) -> CompiledEvaluator:
+        """Compile the evaluator to a shared library using C++ and optionally inline assembly and load it."""
+
+    def evaluate(self, inputs: Sequence[Sequence[float]]) -> List[List[float]]:
+        """Evaluate the expression for multiple inputs and return the result."""
+
+    def evaluate_single(self, inputs: Sequence[Sequence[float]]) -> List[float]:
+        """Evaluate the expression for multiple inputs and return the result, which
+        is a single value."""
+
+    def evaluate_complex(self, inputs: Sequence[Sequence[complex]]) -> List[List[complex]]:
+        """Evaluate the expression for multiple inputs and return the result."""
+
+    def evaluate_complex_single(self, inputs: Sequence[Sequence[complex]]) -> List[complex]:
+        """Evaluate the expression for multiple inputs and return the result, which
+        is a single value."""
+
+
+class CompiledEvaluator:
+    """An compiled evaluator of an expression. This will give the highest performance of 
+    all evaluators."""
+
+    @classmethod
+    def load(
+        _cls,
+        filename: str,
+        function_name: str,
+        output_len: int,
+    ) -> CompiledEvaluator:
+        """Load a compiled library, previously generated with `Evaluator.compile()`."""
+
+    def evaluate(self, inputs: Sequence[Sequence[float]]) -> List[List[float]]:
+        """Evaluate the expression for multiple inputs and return the result."""
+
+    def evaluate_single(self, inputs: Sequence[Sequence[float]]) -> List[float]:
+        """Evaluate the expression for multiple inputs and return the result, which
+        is a single value."""
+
+    def evaluate_complex(self, inputs: Sequence[Sequence[complex]]) -> List[List[complex]]:
+        """Evaluate the expression for multiple inputs and return the result."""
+
+    def evaluate_complex_single(self, inputs: Sequence[Sequence[complex]]) -> List[complex]:
+        """Evaluate the expression for multiple inputs and return the result, which
+        is a single value."""
 
 
 class NumericalIntegrator: