diff --git a/datafusion/expr/src/accumulator.rs b/datafusion/expr/src/accumulator.rs index 523e4e21a640..fa2017586d21 100644 --- a/datafusion/expr/src/accumulator.rs +++ b/datafusion/expr/src/accumulator.rs @@ -21,11 +21,13 @@ use arrow::array::ArrayRef; use datafusion_common::{internal_err, DataFusionError, Result, ScalarValue}; use std::fmt::Debug; -/// Describes an aggregate functions's state. +/// Tracks an aggregate function's state. /// -/// `Accumulator`s are stateful objects that live throughout the -/// evaluation of multiple rows and aggregate multiple values together -/// into a final output aggregate. +/// `Accumulator`s are stateful objects that implement a single group. They +/// aggregate values from multiple rows together into a final output aggregate. +/// +/// [`GroupsAccumulator]` is an additional more performant (but also complex) API +/// that manages state for multiple groups at once. /// /// An accumulator knows how to: /// * update its state from inputs via [`update_batch`] @@ -40,6 +42,7 @@ use std::fmt::Debug; /// [`state`] and combine the state from multiple accumulators' /// via [`merge_batch`], as part of efficient multi-phase grouping. /// +/// [`GroupsAccumulator`]: crate::GroupsAccumulator /// [`update_batch`]: Self::update_batch /// [`retract_batch`]: Self::retract_batch /// [`state`]: Self::state diff --git a/datafusion/expr/src/groups_accumulator.rs b/datafusion/expr/src/groups_accumulator.rs index 6580de19bc68..2ffbfb266e9c 100644 --- a/datafusion/expr/src/groups_accumulator.rs +++ b/datafusion/expr/src/groups_accumulator.rs @@ -59,12 +59,24 @@ impl EmitTo { /// `GroupAccumulator` implements a single aggregate (e.g. AVG) and /// stores the state for *all* groups internally. /// +/// # Notes on Implementing `GroupAccumulator` +/// +/// All aggregates must first implement the simpler [`Accumulator`] trait, which +/// handles state for a single group. Implementing `GroupsAccumulator` is +/// optional and is harder to implement than `Accumulator`, but can be much +/// faster for queries with many group values. See the [Aggregating Millions of +/// Groups Fast blog] for more background. +/// +/// # Details /// Each group is assigned a `group_index` by the hash table and each -/// accumulator manages the specific state, one per group_index. +/// accumulator manages the specific state, one per `group_index`. /// -/// group_indexes are contiguous (there aren't gaps), and thus it is -/// expected that each GroupAccumulator will use something like `Vec<..>` +/// `group_index`es are contiguous (there aren't gaps), and thus it is +/// expected that each `GroupAccumulator` will use something like `Vec<..>` /// to store the group states. +/// +/// [`Accumulator`]: crate::Accumulator +/// [Aggregating Millions of Groups Fast blog]: https://arrow.apache.org/blog/2023/08/05/datafusion_fast_grouping/ pub trait GroupsAccumulator: Send { /// Updates the accumulator's state from its arguments, encoded as /// a vector of [`ArrayRef`]s. diff --git a/datafusion/expr/src/udaf.rs b/datafusion/expr/src/udaf.rs index 444a4f1e8099..fb062e5830eb 100644 --- a/datafusion/expr/src/udaf.rs +++ b/datafusion/expr/src/udaf.rs @@ -37,12 +37,12 @@ use std::sync::Arc; /// functions (`GROUP BY` clause) as well as window functions (`OVER` /// clause). /// -/// `AggregateUDF` provides DataFusion the information needed to plan -/// and call aggregate functions, including name, type information, -/// and a factory function to create [`Accumulator`], which peform the -/// actual aggregation. +/// `AggregateUDF` provides DataFusion the information needed to plan and call +/// aggregate functions, including name, type information, and a factory +/// function to create an [`Accumulator`] instance, to perform the actual +/// aggregation. /// -/// For more information, please see [the examples]. +/// For more information, please see [the examples]: /// /// 1. For simple (less performant) use cases, use [`create_udaf`] and [`simple_udaf.rs`]. /// @@ -58,7 +58,6 @@ use std::sync::Arc; /// [`create_udaf`]: crate::expr_fn::create_udaf /// [`simple_udaf.rs`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/simple_udaf.rs /// [`advanced_udaf.rs`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs - #[derive(Debug, Clone)] pub struct AggregateUDF { inner: Arc, @@ -153,7 +152,7 @@ impl AggregateUDF { self.inner.return_type(args) } - /// Return an accumualator the given aggregate, given + /// Return an accumulator the given aggregate, given /// its return datatype. pub fn accumulator(&self, return_type: &DataType) -> Result> { self.inner.accumulator(return_type)