Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Minor: Improve documentation for AggregateUDFImpl::accumulator and AccumulatorArgs #9920

Merged
merged 7 commits into from
Apr 5, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 12 additions & 1 deletion datafusion-examples/examples/advanced_udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,18 @@ impl AggregateUDFImpl for GeoMeanUdaf {
/// is supported, DataFusion will use this row oriented
/// accumulator when the aggregate function is used as a window function
/// or when there are only aggregates (no GROUP BY columns) in the plan.
fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
// Error if IGNORE NULLs and ORDER BY are specified in the query as this
// UDAF does not support them.
//
// For example `SELECT geo_mean(a) IGNORE NULLS` and `SELECT geo_mean(a)
// ORDER BY b` would fail.
//
// If your Accumulator supports different behavior for these options,
// you can implement it here.
acc_args.check_ignore_nulls(self.name())?;
acc_args.check_order_by(self.name())?;

Ok(Box::new(GeometricMean::new()))
}

Expand Down
131 changes: 119 additions & 12 deletions datafusion/core/tests/user_defined/user_defined_aggregates.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,8 @@

use arrow::{array::AsArray, datatypes::Fields};
use arrow_array::{types::UInt64Type, Int32Array, PrimitiveArray, StructArray};
use arrow_schema::Schema;
use arrow_schema::{Schema, SchemaRef};
use std::any::Any;
use std::sync::{
atomic::{AtomicBool, Ordering},
Arc,
Expand Down Expand Up @@ -48,7 +49,7 @@ use datafusion_expr::{
create_udaf, function::AccumulatorArgs, AggregateUDFImpl, GroupsAccumulator,
SimpleAggregateUDF,
};
use datafusion_physical_expr::expressions::AvgAccumulator;
use datafusion_physical_expr::expressions::{AvgAccumulator, MinAccumulator};

/// Test to show the contents of the setup
#[tokio::test]
Expand Down Expand Up @@ -210,25 +211,33 @@ async fn execute(ctx: &SessionContext, sql: &str) -> Result<Vec<RecordBatch>> {
ctx.sql(sql).await?.collect().await
}

/// tests the creation, registration and usage of a UDAF
#[tokio::test]
async fn simple_udaf() -> Result<()> {
let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
/// Return a SessionContext with a basic table "t"
fn simple_udf_context() -> Result<SessionContext> {
let schema: SchemaRef =
Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));

let batch1 = RecordBatch::try_new(
Arc::new(schema.clone()),
schema.clone(),
vec![Arc::new(Int32Array::from(vec![1, 2, 3]))],
)?;
let batch2 = RecordBatch::try_new(
Arc::new(schema.clone()),
schema.clone(),
vec![Arc::new(Int32Array::from(vec![4, 5]))],
)?;

let ctx = SessionContext::new();

let provider = MemTable::try_new(Arc::new(schema), vec![vec![batch1], vec![batch2]])?;
let provider = MemTable::try_new(schema, vec![vec![batch1], vec![batch2]])?;
ctx.register_table("t", Arc::new(provider))?;

Ok(ctx)
}

/// tests the creation, registration and usage of a UDAF
#[tokio::test]
async fn simple_udaf() -> Result<()> {
let ctx = simple_udf_context()?;

// define a udaf, using a DataFusion's accumulator
let my_avg = create_udaf(
"my_avg",
Expand All @@ -255,6 +264,107 @@ async fn simple_udaf() -> Result<()> {
Ok(())
}

/// Tests creating, registering and invoking an aggregate built on the
/// [`AggregateUDFImpl`] trait.
#[tokio::test]
async fn simple_udaf_trait() -> Result<()> {
    let ctx = simple_udf_context()?;

    // Register an AggregateUDF backed by the MyMin trait implementation
    let udaf = AggregateUDF::from(MyMin::new());
    ctx.register_udaf(udaf);

    // Invoke the UDAF via SQL and gather the output batches
    let df = ctx.sql("SELECT MY_MIN(a) FROM t").await?;
    let batches = df.collect().await?;

    assert_batches_eq!(
        [
            "+-------------+",
            "| my_min(t.a) |",
            "+-------------+",
            "| 1.0 |",
            "+-------------+",
        ],
        &batches
    );

    Ok(())
}

/// Tests that the UDAF rejects `IGNORE NULLS` and `ORDER BY`, which it does
/// not support, while accepting the default `RESPECT NULLS`.
#[tokio::test]
async fn simple_udaf_trait_ignore_nulls() -> Result<()> {
    let ctx = simple_udf_context()?;
    ctx.register_udaf(AggregateUDF::from(MyMin::new()));

    // MyMin does not support IGNORE NULLS, so specifying it should error
    let err = ctx
        .sql("SELECT MY_MIN(a) IGNORE NULLS FROM t")
        .await?
        .collect()
        .await
        .unwrap_err();
    assert_eq!(
        err.to_string(),
        "This feature is not implemented: IGNORE NULLS not implemented for my_min"
    );
    // RESPECT NULLS is the default, so it should work
    ctx.sql("SELECT MY_MIN(a) RESPECT NULLS FROM t")
        .await?
        .collect()
        .await
        .unwrap();

    // MyMin does not support ORDER BY either, so specifying it should error
    let err = ctx
        .sql("SELECT MY_MIN(a ORDER BY a) FROM t")
        .await?
        .collect()
        .await
        .unwrap_err();
    assert_eq!(
        err.to_string(),
        "This feature is not implemented: ORDER BY not implemented for my_min"
    );

    Ok(())
}

/// Example UDAF implemented via the [`AggregateUDFImpl`] trait; computes the
/// minimum of a `Float64` column by delegating to `MinAccumulator`.
#[derive(Debug)]
struct MyMin {
    // Declares exactly one Float64 argument; Immutable volatility marks the
    // function as deterministic
    signature: Signature,
}

impl MyMin {
    /// Create a new `MyMin` accepting a single `Float64` argument.
    fn new() -> Self {
        Self {
            signature: Signature::exact(vec![DataType::Float64], Volatility::Immutable),
        }
    }
}
impl AggregateUDFImpl for MyMin {
    fn as_any(&self) -> &dyn Any {
        self
    }

    /// Name used to invoke the function from SQL (case-insensitive).
    fn name(&self) -> &str {
        "my_min"
    }

    fn signature(&self) -> &Signature {
        &self.signature
    }

    /// Always returns `Float64`, regardless of argument types.
    fn return_type(&self, _arg_types: &[DataType]) -> Result<DataType> {
        Ok(DataType::Float64)
    }

    fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
        // Reject IGNORE NULLS / ORDER BY, which this aggregate does not handle
        acc_args.check_ignore_nulls(self.name())?;
        acc_args.check_order_by(self.name())?;

        // Delegate the actual aggregation to the built-in MinAccumulator
        let accumulator = MinAccumulator::try_new(&DataType::Float64)?;
        Ok(Box::new(accumulator))
    }
}

#[tokio::test]
async fn deregister_udaf() -> Result<()> {
let ctx = SessionContext::new();
Expand Down Expand Up @@ -526,7 +636,6 @@ impl Accumulator for TimeSum {
let arr = arr.as_primitive::<TimestampNanosecondType>();

for v in arr.values().iter() {
println!("Adding {v}");
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

drive by cleanups

self.sum += v;
}
Ok(())
Expand All @@ -538,7 +647,6 @@ impl Accumulator for TimeSum {
}

fn evaluate(&mut self) -> Result<ScalarValue> {
println!("Evaluating to {}", self.sum);
Ok(ScalarValue::TimestampNanosecond(Some(self.sum), None))
}

Expand All @@ -558,7 +666,6 @@ impl Accumulator for TimeSum {
let arr = arr.as_primitive::<TimestampNanosecondType>();

for v in arr.values().iter() {
println!("Retracting {v}");
self.sum -= v;
}
Ok(())
Expand Down
55 changes: 47 additions & 8 deletions datafusion/expr/src/function.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
use crate::ColumnarValue;
use crate::{Accumulator, Expr, PartitionEvaluator};
use arrow::datatypes::{DataType, Schema};
use datafusion_common::Result;
use datafusion_common::{not_impl_err, Result};
use std::sync::Arc;

/// Scalar function
Expand All @@ -38,18 +38,39 @@ pub type ScalarFunctionImplementation =
pub type ReturnTypeFunction =
Arc<dyn Fn(&[DataType]) -> Result<Arc<DataType>> + Send + Sync>;

/// Arguments passed to create an accumulator
/// [`AccumulatorArgs`] contains information about how an aggregate
/// function was called, including the types of its arguments and any optional
/// ordering expressions.
pub struct AccumulatorArgs<'a> {
// default arguments
/// the return type of the function
/// The return type of the aggregate function.
pub data_type: &'a DataType,
/// the schema of the input arguments
/// The schema of the input arguments
pub schema: &'a Schema,
/// whether to ignore nulls
/// Whether to ignore nulls.
///
/// SQL allows the user to specify `IGNORE NULLS`, for example:
///
/// ```sql
/// SELECT FIRST_VALUE(column1) IGNORE NULLS FROM t;
/// ```
///
/// Aggregates that do not support this functionality should return a not
/// implemented error when `ignore_nulls` is true.
pub ignore_nulls: bool,

// ordering arguments
/// the expressions of `order by`, if no ordering is required, this will be an empty slice
/// The expressions in the `ORDER BY` clause passed to this aggregator.
///
/// SQL allows the user to specify the ordering of arguments to the
/// aggregate using an `ORDER BY`. For example:
///
/// ```sql
/// SELECT FIRST_VALUE(column1 ORDER BY column2) FROM t;
/// ```
///
/// If no `ORDER BY` is specified, `sort_exprs` will be empty. Aggregates
/// that do not support this functionality may return a not implemented
/// error when the slice is non-empty, as ordering the arguments is an
/// expensive operation and is wasteful if the aggregate doesn't support it.
pub sort_exprs: &'a [Expr],
}

Expand All @@ -67,6 +88,24 @@ impl<'a> AccumulatorArgs<'a> {
sort_exprs,
}
}

/// Return a not yet implemented error if IGNORE NULLs is true
pub fn check_ignore_nulls(&self, name: &str) -> Result<()> {
if self.ignore_nulls {
Copy link
Contributor

@jayzhan211 jayzhan211 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check !self.ignore_nulls?

I think checkXXX should be added for the user if they think they need to enable it.
In this case, when the user chooses to enable ignore_nulls, they need to add the check. If ignore_nulls is false, it means they should fix their query to contains ignore nulls

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is confusing for the user to understand whether they need to check or not.
I think renaming it to disable_xxx would help, or change the logic and rename it to enable_xxx

Copy link
Contributor

@jayzhan211 jayzhan211 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is also a problem because if they forget to add check_ignore_nulls/check_order_by in the accumulator, they can still run the function successfully. This approach does not force the user to check their options because datafusion implements them, not the user.

To enforce they specified the options for their functions, I think we can add the checking function in AggregateUDFImpl. So the user needs to set the true/false for their options

Either

// We will check if the AccumulatorArgs meet the requirement or not.
fn options() -> AccumulatorArgs {
   AccumulatorArgs {
     ignore_nulls: true/ false,
     has_ordering: true / false
    }
}

or

// The same with this one, but separate each option
fn support_ignore_nulls() -> bool
fn support_ordering() -> bool

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should we check !self.ignore_nulls?

I don't think we can do this, because the ignore_nulls is true for the following queries

SELECT avg(x) FROM ...;
SELECT avg(x) RESPECT NULLS FROM ...;

In other words, it is the default even when the user doesn't explicitly specify the handling

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is confusing for the user to understand whether they need to check or not. I think rename it to disable_xxx help or change the logic an rename it to enable_xxx

I don't quite understand this suggestion -- are you suggesting rename check_ignore_nulls to disable_ignore_nulls?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is also a problem because if they forget to add check_ignore_nulls/check_order_by in the accumulator, they can still run the function successfully. This approach does not force the user to check their options because datafusion implements them, not the user.

That is a good point --- in fact it actually affects built in aggregates too today

select count(*) from (values (1), (null), (2));
+----------+
| COUNT(*) |
+----------+
| 3        |
+----------+
1 row in set. Query took 0.039 seconds.

❯ select count(*) IGNORE NULLS from (values (1), (null), (2));
+----------+
| COUNT(*) |
+----------+
| 3        |
+----------+
1 row in set. Query took 0.001 seconds.

I think this is a separate issue, and not made worse by this PR -- I filed #9924 to track. I suggest we work on improving it as a follow-on PR

Copy link
Contributor

@jayzhan211 jayzhan211 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think it is confusing for the user to understand whether they need to check or not. I think rename it to disable_xxx help or change the logic an rename it to enable_xxx

I don't quite understand this suggestion -- are you suggesting rename check_ignore_nulls to disable_ignore_nulls?

yes, that is what I suggest, so we know exactly whether it is disabled or not. But I think the comment here also helps, rename is not necessary

Copy link
Contributor

@jayzhan211 jayzhan211 Apr 3, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is a separate issue, and not made worse by this PR -- I filed #9924 to track. I suggest we work on improving it as a follow-on PR

But I think if we implement these for UDFImpl,

fn support_ignore_nulls() -> bool
fn support_ordering() -> bool

we probably don't need the check_ignore_nulls, because we can check it for them!

create_aggregate_expr is the earliest place we know ignore_nulls and sort_exprs, and we can call fun.support_ignore_nulls() to check for them, so do ordering.

https://github.com/apache/arrow-datafusion/blob/daf182dc789230dbd9cf21ca2e975789213a5365/datafusion/physical-plan/src/udaf.rs#L38-L46

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a good idea and I think it should be done in #9924

I will update this PR to remove the check_* functions and only update the docs

not_impl_err!("IGNORE NULLS not implemented for {name}")
} else {
Ok(())
}
}

/// Returns a not-yet-implemented error if an `ORDER BY` clause was
/// specified for this aggregate call.
pub fn check_order_by(&self, name: &str) -> Result<()> {
    // An empty sort_exprs slice means no ORDER BY was specified
    if self.sort_exprs.is_empty() {
        return Ok(());
    }
    not_impl_err!("ORDER BY not implemented for {name}")
}
}

/// Factory that returns an accumulator for the given aggregate function.
Expand Down
19 changes: 16 additions & 3 deletions datafusion/expr/src/udaf.rs
Original file line number Diff line number Diff line change
Expand Up @@ -211,8 +211,8 @@ where
/// See [`advanced_udaf.rs`] for a full example with complete implementation and
/// [`AggregateUDF`] for other available options.
///
///
/// [`advanced_udaf.rs`]: https://github.com/apache/arrow-datafusion/blob/main/datafusion-examples/examples/advanced_udaf.rs
///
/// # Basic Example
/// ```
/// # use std::any::Any;
Expand Down Expand Up @@ -247,7 +247,12 @@ where
/// Ok(DataType::Float64)
/// }
/// // This is the accumulator factory; DataFusion uses it to create new accumulators.
/// fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> { unimplemented!() }
/// fn accumulator(&self, _acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>> {
///     // Error if IGNORE NULLs and ORDER BY are specified in the query
///     // _acc_args.check_ignore_nulls(self.name())?;
///     // _acc_args.check_order_by(self.name())?;
///     unimplemented!()
/// }
/// fn state_fields(&self, _name: &str, value_type: DataType, _ordering_fields: Vec<Field>) -> Result<Vec<Field>> {
/// Ok(vec![
/// Field::new("value", value_type, true),
Expand Down Expand Up @@ -280,7 +285,8 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
/// Return a new [`Accumulator`] that aggregates values for a specific
/// group during query execution.
///
/// `acc_args`: the arguments to the accumulator. See [`AccumulatorArgs`] for more details.
/// acc_args: [`AccumulatorArgs`] contains information about how the
/// aggregate function was called.
fn accumulator(&self, acc_args: AccumulatorArgs) -> Result<Box<dyn Accumulator>>;

/// Return the fields of the intermediate state.
Expand Down Expand Up @@ -308,6 +314,13 @@ pub trait AggregateUDFImpl: Debug + Send + Sync {
/// If the aggregate expression has a specialized
/// [`GroupsAccumulator`] implementation. If this returns true,
/// `[Self::create_groups_accumulator]` will be called.
///
/// # Notes
///
/// Even if this function returns true, DataFusion will call
/// `Self::accumulator` for certain queries, such as when this aggregate is
/// used as a window function or when there are only aggregates (no GROUP BY
/// columns) in the plan.
fn groups_accumulator_supported(&self) -> bool {
false
}
Expand Down
Loading