diff --git a/datafusion/core/src/bin/print_functions_docs.rs b/datafusion/core/src/bin/print_functions_docs.rs index 7f3990c534278..355a971114e26 100644 --- a/datafusion/core/src/bin/print_functions_docs.rs +++ b/datafusion/core/src/bin/print_functions_docs.rs @@ -158,6 +158,23 @@ fn print_docs( unreachable!() }; + // let attr_text = documentation.to_doc_attribute(); + // + // let file_path = format!("{}.txt", name); + // if std::path::Path::new(&file_path).exists() { + // std::fs::remove_file(&file_path).unwrap(); + // } + // + // // Open the file in append mode, create it if it doesn't exist + // let mut file = std::fs::OpenOptions::new() + // .append(true) // Open in append mode + // .create(true) // Create the file if it doesn't exist + // .open(file_path) + // .unwrap(); + // + // use std::io::Write; + // file.write_all(attr_text.as_bytes()).unwrap(); + // first, the name, description and syntax example let _ = write!( docs, diff --git a/datafusion/doc/src/lib.rs b/datafusion/doc/src/lib.rs index 5bc986d07f8ed..a7a1474a64954 100644 --- a/datafusion/doc/src/lib.rs +++ b/datafusion/doc/src/lib.rs @@ -63,6 +63,88 @@ impl Documentation { ) -> DocumentationBuilder { DocumentationBuilder::new(doc_section, description, syntax_example) } + + /// Output the `Documentation` struct in form of custom Rust documentation attributes + pub fn to_doc_attribute(&self) -> String { + let mut result = String::new(); + + result.push_str("#[user_doc("); + // Doc Section + result.push_str( + format!( + "\n doc_section({}label = \"{}\"{}),", + if !self.doc_section.include { + "include = \"false\", " + } else { + "" + }, + self.doc_section.label, + self.doc_section + .description + .map(|s| format!(", description = \"{}\"", s)) + .unwrap_or_default(), + ) + .as_ref(), + ); + + // Description + result.push_str(format!("\n description=\"{}\",", self.description).as_ref()); + // Syntax Example + result.push_str( + format!("\n syntax_example=\"{}\",", self.syntax_example).as_ref(), + ); + 
// SQL Example + result.push_str( + &self + .sql_example + .clone() + .map(|s| format!("\n sql_example = r#\"{}\"#,", s)) + .unwrap_or_default(), + ); + + let st_arg_token = " expression to operate on. Can be a constant, column, or function, and any combination of operators."; + // Standard Arguments + if let Some(args) = self.arguments.clone() { + args.iter().for_each(|(name, value)| { + if value.contains(st_arg_token) { + if name.starts_with("The ") { + result.push_str(format!("\n standard_argument(\n name = \"{}\"),", name).as_ref()); + } else { + result.push_str(format!("\n standard_argument(\n name = \"{}\",\n prefix = \"{}\"\n ),", name, value.replace(st_arg_token, "")).as_ref()); + } + } + }); + } + + // Arguments + if let Some(args) = self.arguments.clone() { + args.iter().for_each(|(name, value)| { + if !value.contains(st_arg_token) { + result.push_str(format!("\n argument(\n name = \"{}\",\n description = \"{}\"\n ),", name, value).as_ref()); + } + }); + } + + if let Some(alt_syntax) = self.alternative_syntax.clone() { + alt_syntax.iter().for_each(|syntax| { + result.push_str( + format!("\n alternative_syntax = \"{}\",", syntax).as_ref(), + ); + }); + } + + // Related UDFs + if let Some(related_udf) = self.related_udfs.clone() { + related_udf.iter().for_each(|udf| { + result + .push_str(format!("\n related_udf(name = \"{}\"),", udf).as_ref()); + }); + } + + result.push_str("\n)]"); + + result + } } #[derive(Debug, Clone, PartialEq)] diff --git a/datafusion/functions-aggregate/Cargo.toml b/datafusion/functions-aggregate/Cargo.toml index 33a52afbe21a8..20197630b3182 100644 --- a/datafusion/functions-aggregate/Cargo.toml +++ b/datafusion/functions-aggregate/Cargo.toml @@ -42,8 +42,10 @@ ahash = { workspace = true } arrow = { workspace = true } arrow-schema = { workspace = true } datafusion-common = { workspace = true } +datafusion-doc = { workspace = true } datafusion-execution = { workspace = true } datafusion-expr = { workspace = true } +datafusion-macros 
= { workspace = true } datafusion-functions-aggregate-common = { workspace = true } datafusion-physical-expr = { workspace = true } datafusion-physical-expr-common = { workspace = true } diff --git a/datafusion/functions-aggregate/src/approx_distinct.rs b/datafusion/functions-aggregate/src/approx_distinct.rs index e955cea9d1a08..74691ba740fdd 100644 --- a/datafusion/functions-aggregate/src/approx_distinct.rs +++ b/datafusion/functions-aggregate/src/approx_distinct.rs @@ -31,17 +31,19 @@ use datafusion_common::ScalarValue; use datafusion_common::{ downcast_value, internal_err, not_impl_err, DataFusionError, Result, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; +use datafusion_macros::user_doc; use std::any::Any; use std::fmt::{Debug, Formatter}; use std::hash::Hash; use std::marker::PhantomData; use std::sync::OnceLock; + make_udaf_expr_and_func!( ApproxDistinct, approx_distinct, @@ -243,6 +245,20 @@ impl Default for ApproxDistinct { } } +#[user_doc( + doc_section(label = "Approximate Functions"), + description = "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", + syntax_example = "approx_distinct(expression)", + sql_example = r#"```sql +> SELECT approx_distinct(column_name) FROM table_name; ++-----------------------------------+ +| approx_distinct(column_name) | ++-----------------------------------+ +| 42 | ++-----------------------------------+ +```"#, + standard_argument(name = "expression",) +)] pub struct ApproxDistinct { signature: Signature, } @@ -309,25 +325,6 @@ impl AggregateUDFImpl for ApproxDistinct { } fn documentation(&self) -> Option<&Documentation> { - Some(get_approx_distinct_doc()) + self.doc() } } - -static DOCUMENTATION: 
OnceLock = OnceLock::new(); - -fn get_approx_distinct_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder(DOC_SECTION_APPROXIMATE, "Returns the approximate number of distinct input values calculated using the HyperLogLog algorithm.", "approx_distinct(expression)") - .with_sql_example(r#"```sql -> SELECT approx_distinct(column_name) FROM table_name; -+-----------------------------------+ -| approx_distinct(column_name) | -+-----------------------------------+ -| 42 | -+-----------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} diff --git a/datafusion/functions-aggregate/src/approx_median.rs b/datafusion/functions-aggregate/src/approx_median.rs index 8920c8e5f0c48..d6486c296785c 100644 --- a/datafusion/functions-aggregate/src/approx_median.rs +++ b/datafusion/functions-aggregate/src/approx_median.rs @@ -25,13 +25,14 @@ use arrow::{datatypes::DataType, datatypes::Field}; use arrow_schema::DataType::{Float64, UInt64}; use datafusion_common::{not_impl_err, plan_err, Result}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; +use datafusion_macros::user_doc; use crate::approx_percentile_cont::ApproxPercentileAccumulator; @@ -44,6 +45,20 @@ make_udaf_expr_and_func!( ); /// APPROX_MEDIAN aggregate expression +#[user_doc( + doc_section(label = "Approximate Functions"), + description = "Returns the approximate median (50th percentile) of input values. 
It is an alias of `approx_percentile_cont(x, 0.5)`.", + syntax_example = "approx_median(expression)", + sql_example = r#"```sql +> SELECT approx_median(column_name) FROM table_name; ++-----------------------------------+ +| approx_median(column_name) | ++-----------------------------------+ +| 23.5 | ++-----------------------------------+ +```"#, + standard_argument(name = "expression",) +)] pub struct ApproxMedian { signature: Signature, } @@ -122,29 +137,6 @@ impl AggregateUDFImpl for ApproxMedian { } fn documentation(&self) -> Option<&Documentation> { - Some(get_approx_median_doc()) + self.doc() } } - -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_approx_median_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_APPROXIMATE, - "Returns the approximate median (50th percentile) of input values. It is an alias of `approx_percentile_cont(x, 0.5)`.", - - "approx_median(expression)") - .with_sql_example(r#"```sql -> SELECT approx_median(column_name) FROM table_name; -+-----------------------------------+ -| approx_median(column_name) | -+-----------------------------------+ -| 23.5 | -+-----------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont.rs b/datafusion/functions-aggregate/src/approx_percentile_cont.rs index 6edae6344ab15..b81d1717c2afb 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont.rs @@ -35,7 +35,7 @@ use datafusion_common::{ downcast_value, internal_err, not_impl_datafusion_err, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{INTEGERS, NUMERICS}; 
use datafusion_expr::utils::format_state_name; @@ -46,6 +46,7 @@ use datafusion_expr::{ use datafusion_functions_aggregate_common::tdigest::{ TDigest, TryIntoF64, DEFAULT_MAX_SIZE, }; +use datafusion_macros::user_doc; use datafusion_physical_expr_common::physical_expr::PhysicalExpr; create_func!(ApproxPercentileCont, approx_percentile_cont_udaf); @@ -64,6 +65,28 @@ pub fn approx_percentile_cont( approx_percentile_cont_udaf().call(args) } +#[user_doc( + doc_section(label = "Approximate Functions"), + description = "Returns the approximate percentile of input values using the t-digest algorithm.", + syntax_example = "approx_percentile_cont(expression, percentile, centroids)", + sql_example = r#"```sql +> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; ++-------------------------------------------------+ +| approx_percentile_cont(column_name, 0.75, 100) | ++-------------------------------------------------+ +| 65.0 | ++-------------------------------------------------+ +```"#, + standard_argument(name = "expression",), + argument( + name = "percentile", + description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)." + ), + argument( + name = "centroids", + description = "Number of centroids to use in the t-digest algorithm. _Default is 100_. A higher number results in more accurate approximation but requires more memory." 
+ ) +)] pub struct ApproxPercentileCont { signature: Signature, } @@ -272,33 +295,10 @@ impl AggregateUDFImpl for ApproxPercentileCont { } fn documentation(&self) -> Option<&Documentation> { - Some(get_approx_percentile_cont_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_approx_percentile_cont_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_APPROXIMATE, - "Returns the approximate percentile of input values using the t-digest algorithm.", - "approx_percentile_cont(expression, percentile, centroids)") - .with_sql_example(r#"```sql -> SELECT approx_percentile_cont(column_name, 0.75, 100) FROM table_name; -+-------------------------------------------------+ -| approx_percentile_cont(column_name, 0.75, 100) | -+-------------------------------------------------+ -| 65.0 | -+-------------------------------------------------+ -```"#) - .with_standard_argument("expression", None) - .with_argument("percentile", "Percentile to compute. Must be a float value between 0 and 1 (inclusive).") - .with_argument("centroids", "Number of centroids to use in the t-digest algorithm. _Default is 100_. 
A higher number results in more accurate approximation but requires more memory.") - .build() - }) -} - #[derive(Debug)] pub struct ApproxPercentileAccumulator { digest: TDigest, diff --git a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs index 7cf8d2dca13f0..485874aeb2841 100644 --- a/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs +++ b/datafusion/functions-aggregate/src/approx_percentile_cont_with_weight.rs @@ -27,7 +27,7 @@ use arrow::{ use datafusion_common::ScalarValue; use datafusion_common::{not_impl_err, plan_err, Result}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_APPROXIMATE; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::NUMERICS; use datafusion_expr::Volatility::Immutable; @@ -37,6 +37,7 @@ use datafusion_expr::{ use datafusion_functions_aggregate_common::tdigest::{ Centroid, TDigest, DEFAULT_MAX_SIZE, }; +use datafusion_macros::user_doc; use crate::approx_percentile_cont::{ApproxPercentileAccumulator, ApproxPercentileCont}; @@ -49,6 +50,28 @@ make_udaf_expr_and_func!( ); /// APPROX_PERCENTILE_CONT_WITH_WEIGHT aggregate expression +#[user_doc( + doc_section(label = "Approximate Functions"), + description = "Returns the weighted approximate percentile of input values using the t-digest algorithm.", + syntax_example = "approx_percentile_cont_with_weight(expression, weight, percentile)", + sql_example = r#"```sql +> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; ++----------------------------------------------------------------------+ +| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | ++----------------------------------------------------------------------+ +| 78.5 | ++----------------------------------------------------------------------+ +```"#, + 
standard_argument(name = "expression", prefix = "The"), + argument( + name = "weight", + description = "Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators." + ), + argument( + name = "percentile", + description = "Percentile to compute. Must be a float value between 0 and 1 (inclusive)." + ) +)] pub struct ApproxPercentileContWithWeight { signature: Signature, approx_percentile_cont: ApproxPercentileCont, @@ -157,35 +180,10 @@ impl AggregateUDFImpl for ApproxPercentileContWithWeight { } fn documentation(&self) -> Option<&Documentation> { - Some(get_approx_percentile_cont_with_weight_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_approx_percentile_cont_with_weight_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_APPROXIMATE, - "Returns the weighted approximate percentile of input values using the t-digest algorithm.", - - "approx_percentile_cont_with_weight(expression, weight, percentile)") - .with_sql_example(r#"```sql -> SELECT approx_percentile_cont_with_weight(column_name, weight_column, 0.90) FROM table_name; -+----------------------------------------------------------------------+ -| approx_percentile_cont_with_weight(column_name, weight_column, 0.90) | -+----------------------------------------------------------------------+ -| 78.5 | -+----------------------------------------------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .with_argument("weight", "Expression to use as weight. Can be a constant, column, or function, and any combination of arithmetic operators.") - .with_argument("percentile", "Percentile to compute. 
Must be a float value between 0 and 1 (inclusive).") - .build() - }) -} - #[derive(Debug)] pub struct ApproxPercentileWithWeightAccumulator { approx_percentile_cont_accumulator: ApproxPercentileAccumulator, diff --git a/datafusion/functions-aggregate/src/array_agg.rs b/datafusion/functions-aggregate/src/array_agg.rs index 3b9a521ec9721..4a7e263909622 100644 --- a/datafusion/functions-aggregate/src/array_agg.rs +++ b/datafusion/functions-aggregate/src/array_agg.rs @@ -25,13 +25,14 @@ use datafusion_common::cast::as_list_array; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, ScalarValue}; use datafusion_common::{internal_err, Result}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{Accumulator, Signature, Volatility}; use datafusion_expr::{AggregateUDFImpl, Documentation}; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; +use datafusion_macros::user_doc; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; use std::collections::{HashSet, VecDeque}; use std::mem::{size_of, size_of_val}; @@ -45,6 +46,20 @@ make_udaf_expr_and_func!( array_agg_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns an array created from the expression elements. 
If ordering is required, elements are inserted in the specified order.", + syntax_example = "array_agg(expression [ORDER BY expression])", + sql_example = r#"```sql +> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| array_agg(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| [element1, element2, element3] | ++-----------------------------------------------+ +```"#, + standard_argument(name = "expression",) +)] #[derive(Debug)] /// ARRAY_AGG aggregate expression pub struct ArrayAgg { @@ -146,33 +161,10 @@ impl AggregateUDFImpl for ArrayAgg { } fn documentation(&self) -> Option<&Documentation> { - Some(get_array_agg_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_array_agg_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns an array created from the expression elements. 
If ordering is required, elements are inserted in the specified order.", - - "array_agg(expression [ORDER BY expression])") - .with_sql_example(r#"```sql -> SELECT array_agg(column_name ORDER BY other_column) FROM table_name; -+-----------------------------------------------+ -| array_agg(column_name ORDER BY other_column) | -+-----------------------------------------------+ -| [element1, element2, element3] | -+-----------------------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug)] pub struct ArrayAggAccumulator { values: Vec, diff --git a/datafusion/functions-aggregate/src/average.rs b/datafusion/functions-aggregate/src/average.rs index 3fa58f3c2082e..65ca441517a0e 100644 --- a/datafusion/functions-aggregate/src/average.rs +++ b/datafusion/functions-aggregate/src/average.rs @@ -28,7 +28,6 @@ use arrow::datatypes::{ Float64Type, UInt64Type, }; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::type_coercion::aggregates::{avg_return_type, coerce_avg_type}; use datafusion_expr::utils::format_state_name; @@ -43,7 +42,9 @@ use datafusion_functions_aggregate_common::aggregate::groups_accumulator::nulls: filtered_null_mask, set_nulls, }; +use datafusion_doc::DocSection; use datafusion_functions_aggregate_common::utils::DecimalAverager; +use datafusion_macros::user_doc; use log::debug; use std::any::Any; use std::fmt::Debug; @@ -58,6 +59,20 @@ make_udaf_expr_and_func!( avg_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the average of numeric values in the specified column.", + syntax_example = "avg(expression)", + sql_example = r#"```sql +> SELECT avg(column_name) FROM table_name; ++---------------------------+ +| avg(column_name) | ++---------------------------+ +| 42.75 | 
++---------------------------+ +```"#, + standard_argument(name = "expression",) +)] #[derive(Debug)] pub struct Avg { signature: Signature, @@ -240,34 +255,10 @@ impl AggregateUDFImpl for Avg { } fn documentation(&self) -> Option<&Documentation> { - Some(get_avg_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_avg_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the average of numeric values in the specified column.", - "avg(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT avg(column_name) FROM table_name; -+---------------------------+ -| avg(column_name) | -+---------------------------+ -| 42.75 | -+---------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - /// An accumulator to compute the average #[derive(Debug, Default)] pub struct AvgAccumulator { diff --git a/datafusion/functions-aggregate/src/bool_and_or.rs b/datafusion/functions-aggregate/src/bool_and_or.rs index df9271d8160a6..1b5b20f43b3e9 100644 --- a/datafusion/functions-aggregate/src/bool_and_or.rs +++ b/datafusion/functions-aggregate/src/bool_and_or.rs @@ -31,7 +31,6 @@ use arrow::datatypes::Field; use datafusion_common::internal_err; use datafusion_common::{downcast_value, not_impl_err}; use datafusion_common::{DataFusionError, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ @@ -39,7 +38,9 @@ use datafusion_expr::{ Signature, Volatility, }; +use datafusion_doc::DocSection; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::bool_op::BooleanGroupsAccumulator; +use datafusion_macros::user_doc; // returns the new value after bool_and/bool_or with the new values, taking nullability into account 
macro_rules! typed_bool_and_or_batch { @@ -92,6 +93,20 @@ make_udaf_expr_and_func!( bool_or_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns true if all non-null input values are true, otherwise false.", + syntax_example = "bool_and(expression)", + sql_example = r#"```sql +> SELECT bool_and(column_name) FROM table_name; ++----------------------------+ +| bool_and(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#, + standard_argument(name = "expression", prefix = "The") +)] /// BOOL_AND aggregate expression #[derive(Debug)] pub struct BoolAnd { @@ -178,34 +193,10 @@ impl AggregateUDFImpl for BoolAnd { } fn documentation(&self) -> Option<&Documentation> { - Some(get_bool_and_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_bool_and_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns true if all non-null input values are true, otherwise false.", - "bool_and(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT bool_and(column_name) FROM table_name; -+----------------------------+ -| bool_and(column_name) | -+----------------------------+ -| true | -+----------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug, Default)] struct BoolAndAccumulator { acc: Option, @@ -240,6 +231,20 @@ impl Accumulator for BoolAndAccumulator { } } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns true if any non-null input value is true, otherwise false.", + syntax_example = "bool_or(expression)", + sql_example = r#"```sql +> SELECT bool_or(column_name) FROM table_name; ++----------------------------+ +| bool_or(column_name) | ++----------------------------+ +| true | ++----------------------------+ +```"#, + standard_argument(name = "expression", prefix = "The") +)] /// BOOL_OR aggregate
expression #[derive(Debug, Clone)] pub struct BoolOr { @@ -327,32 +332,10 @@ impl AggregateUDFImpl for BoolOr { } fn documentation(&self) -> Option<&Documentation> { - Some(get_bool_or_doc()) + self.doc() } } -fn get_bool_or_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns true if any non-null input value is true, otherwise false.", - "bool_or(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT bool_or(column_name) FROM table_name; -+----------------------------+ -| bool_or(column_name) | -+----------------------------+ -| true | -+----------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug, Default)] struct BoolOrAccumulator { acc: Option, diff --git a/datafusion/functions-aggregate/src/correlation.rs b/datafusion/functions-aggregate/src/correlation.rs index 4711b42407974..b40555bf6c7f0 100644 --- a/datafusion/functions-aggregate/src/correlation.rs +++ b/datafusion/functions-aggregate/src/correlation.rs @@ -31,7 +31,7 @@ use arrow::{ use crate::covariance::CovarianceAccumulator; use crate::stddev::StddevAccumulator; use datafusion_common::{plan_err, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; +use datafusion_doc::DocSection; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, @@ -39,6 +39,7 @@ use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; +use datafusion_macros::user_doc; make_udaf_expr_and_func!( Correlation, @@ -48,6 +49,21 @@ make_udaf_expr_and_func!( corr_udaf ); +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the coefficient of correlation between two numeric values.", + syntax_example = "corr(expression1, expression2)", + sql_example = r#"```sql +> SELECT 
corr(column1, column2) FROM table_name; ++--------------------------------+ +| corr(column1, column2) | ++--------------------------------+ +| 0.85 | ++--------------------------------+ +```"#, + standard_argument(name = "expression1", prefix = "First"), + standard_argument(name = "expression2", prefix = "Second") +)] #[derive(Debug)] pub struct Correlation { signature: Signature, @@ -111,35 +127,10 @@ impl AggregateUDFImpl for Correlation { } fn documentation(&self) -> Option<&Documentation> { - Some(get_corr_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_corr_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the coefficient of correlation between two numeric values.", - "corr(expression1, expression2)", - ) - .with_sql_example( - r#"```sql -> SELECT corr(column1, column2) FROM table_name; -+--------------------------------+ -| corr(column1, column2) | -+--------------------------------+ -| 0.85 | -+--------------------------------+ -```"#, - ) - .with_standard_argument("expression1", Some("First")) - .with_standard_argument("expression2", Some("Second")) - .build() - }) -} - /// An accumulator to compute correlation #[derive(Debug)] pub struct CorrelationAccumulator { diff --git a/datafusion/functions-aggregate/src/count.rs b/datafusion/functions-aggregate/src/count.rs index c8f8c8153ce11..3006de27397e1 100644 --- a/datafusion/functions-aggregate/src/count.rs +++ b/datafusion/functions-aggregate/src/count.rs @@ -17,7 +17,9 @@ use ahash::RandomState; use datafusion_common::stats::Precision; +use datafusion_doc::DocSection; use datafusion_functions_aggregate_common::aggregate::count_distinct::BytesViewDistinctCountAccumulator; +use datafusion_macros::user_doc; use datafusion_physical_expr::expressions; use std::collections::HashSet; use std::fmt::Debug; @@ -45,7 +47,6 @@ use arrow::{ use datafusion_common::{ downcast_value, internal_err, 
not_impl_err, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, @@ -79,6 +80,27 @@ pub fn count_distinct(expr: Expr) -> Expr { )) } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the number of non-null values in the specified column. To include null values in the total count, use `count(*)`.", + syntax_example = "count(expression)", + sql_example = r#"```sql +> SELECT count(column_name) FROM table_name; ++-----------------------+ +| count(column_name) | ++-----------------------+ +| 100 | ++-----------------------+ + +> SELECT count(*) FROM table_name; ++------------------+ +| count(*) | ++------------------+ +| 120 | ++------------------+ +```"#, + standard_argument(name = "expression",) +)] pub struct Count { signature: Signature, } @@ -328,39 +350,10 @@ impl AggregateUDFImpl for Count { } fn documentation(&self) -> Option<&Documentation> { - Some(get_count_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_count_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the number of non-null values in the specified column. 
To include null values in the total count, use `count(*)`.", - - "count(expression)") - .with_sql_example(r#"```sql -> SELECT count(column_name) FROM table_name; -+-----------------------+ -| count(column_name) | -+-----------------------+ -| 100 | -+-----------------------+ - -> SELECT count(*) FROM table_name; -+------------------+ -| count(*) | -+------------------+ -| 120 | -+------------------+ -```"#) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug)] struct CountAccumulator { count: i64, diff --git a/datafusion/functions-aggregate/src/covariance.rs b/datafusion/functions-aggregate/src/covariance.rs index 0c29589e90956..adb546e4d9066 100644 --- a/datafusion/functions-aggregate/src/covariance.rs +++ b/datafusion/functions-aggregate/src/covariance.rs @@ -31,7 +31,7 @@ use datafusion_common::{ downcast_value, plan_err, unwrap_or_internal_err, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; +use datafusion_doc::DocSection; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, type_coercion::aggregates::NUMERICS, @@ -39,6 +39,7 @@ use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; +use datafusion_macros::user_doc; make_udaf_expr_and_func!( CovarianceSample, @@ -56,6 +57,21 @@ make_udaf_expr_and_func!( covar_pop_udaf ); +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the sample covariance of a set of number pairs.", + syntax_example = "covar_samp(expression1, expression2)", + sql_example = r#"```sql +> SELECT covar_samp(column1, column2) FROM table_name; ++-----------------------------------+ +| covar_samp(column1, column2) | ++-----------------------------------+ +| 8.25 | ++-----------------------------------+ +```"#, + standard_argument(name = "expression1", prefix = "First"), + standard_argument(name =
"expression2", prefix = "Second") +)] pub struct CovarianceSample { signature: Signature, aliases: Vec, @@ -129,21 +145,15 @@ impl AggregateUDFImpl for CovarianceSample { } fn documentation(&self) -> Option<&Documentation> { - Some(get_covar_samp_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_covar_samp_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the sample covariance of a set of number pairs.", - "covar_samp(expression1, expression2)", - ) - .with_sql_example( - r#"```sql +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the population covariance of a set of number pairs.", + syntax_example = "covar_pop(expression1, expression2)", + sql_example = r#"```sql -> SELECT covar_samp(column1, column2) FROM table_name; +> SELECT covar_pop(column1, column2) FROM table_name; +-----------------------------------+ -| covar_samp(column1, column2) | +| covar_pop(column1, column2) | @@ -151,13 +161,9 @@ fn get_covar_samp_doc() -> &'static Documentation { -| 8.25 | +| 7.63 | +-----------------------------------+ ```"#, - ) - .with_standard_argument("expression1", Some("First")) - .with_standard_argument("expression2", Some("Second")) - .build() - }) -} - + standard_argument(name = "expression1", prefix = "First"), + standard_argument(name = "expression2", prefix = "Second") +)] pub struct CovariancePopulation { signature: Signature, } @@ -227,33 +233,10 @@ impl AggregateUDFImpl for CovariancePopulation { } fn documentation(&self) -> Option<&Documentation> { - Some(get_covar_pop_doc()) + self.doc() } } -fn get_covar_pop_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the population covariance of a set of number pairs.", - "covar_pop(expression1, expression2)", - ) - .with_sql_example( - r#"```sql -> SELECT covar_pop(column1, column2) FROM table_name; -+-----------------------------------+ -| covar_pop(column1, column2) |
-+-----------------------------------+ -| 7.63 | -+-----------------------------------+ -```"#, - ) - .with_standard_argument("expression1", Some("First")) - .with_standard_argument("expression2", Some("Second")) - .build() - }) -} - /// An accumulator to compute covariance /// The algorithm used is an online implementation and numerically stable. It is derived from the following paper /// for calculating variance: diff --git a/datafusion/functions-aggregate/src/first_last.rs b/datafusion/functions-aggregate/src/first_last.rs index b7001f52ac84b..f3e66edbc0091 100644 --- a/datafusion/functions-aggregate/src/first_last.rs +++ b/datafusion/functions-aggregate/src/first_last.rs @@ -29,7 +29,7 @@ use datafusion_common::utils::{compare_rows, get_row_at_idx}; use datafusion_common::{ arrow_datafusion_err, internal_err, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::{format_state_name, AggregateOrderSensitivity}; use datafusion_expr::{ @@ -37,6 +37,7 @@ use datafusion_expr::{ SortExpr, Volatility, }; use datafusion_functions_aggregate_common::utils::get_sort_options; +use datafusion_macros::user_doc; use datafusion_physical_expr_common::sort_expr::LexOrdering; create_func!(FirstValue, first_value_udaf); @@ -55,6 +56,20 @@ pub fn first_value(expression: Expr, order_by: Option>) -> Expr { } } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the first element in an aggregation group according to the requested ordering. 
If no ordering is given, returns an arbitrary element from the group.", + syntax_example = "first_value(expression [ORDER BY expression])", + sql_example = r#"```sql +> SELECT first_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| first_value(column_name ORDER BY other_column)| ++-----------------------------------------------+ +| first_element | ++-----------------------------------------------+ +```"#, + standard_argument(name = "expression",) +)] pub struct FirstValue { signature: Signature, requirement_satisfied: bool, @@ -161,33 +176,10 @@ impl AggregateUDFImpl for FirstValue { } fn documentation(&self) -> Option<&Documentation> { - Some(get_first_value_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_first_value_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", - - "first_value(expression [ORDER BY expression])") - .with_sql_example(r#"```sql -> SELECT first_value(column_name ORDER BY other_column) FROM table_name; -+-----------------------------------------------+ -| first_value(column_name ORDER BY other_column)| -+-----------------------------------------------+ -| first_element | -+-----------------------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug)] pub struct FirstValueAccumulator { first: ScalarValue, @@ -372,6 +364,20 @@ make_udaf_expr_and_func!( last_value_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the last element in an aggregation group according to the requested ordering. 
If no ordering is given, returns an arbitrary element from the group.", + syntax_example = "last_value(expression [ORDER BY expression])", + sql_example = r#"```sql +> SELECT last_value(column_name ORDER BY other_column) FROM table_name; ++-----------------------------------------------+ +| last_value(column_name ORDER BY other_column) | ++-----------------------------------------------+ +| last_element | ++-----------------------------------------------+ +```"#, + standard_argument(name = "expression",) +)] pub struct LastValue { signature: Signature, requirement_satisfied: bool, @@ -483,31 +489,10 @@ impl AggregateUDFImpl for LastValue { } fn documentation(&self) -> Option<&Documentation> { - Some(get_last_value_doc()) + self.doc() } } -fn get_last_value_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group.", - - "last_value(expression [ORDER BY expression])") - .with_sql_example(r#"```sql -> SELECT last_value(column_name ORDER BY other_column) FROM table_name; -+-----------------------------------------------+ -| last_value(column_name ORDER BY other_column) | -+-----------------------------------------------+ -| last_element | -+-----------------------------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - #[derive(Debug)] struct LastValueAccumulator { last: ScalarValue, diff --git a/datafusion/functions-aggregate/src/grouping.rs b/datafusion/functions-aggregate/src/grouping.rs index 4a45890b0e70c..36bdf68c1b0ec 100644 --- a/datafusion/functions-aggregate/src/grouping.rs +++ b/datafusion/functions-aggregate/src/grouping.rs @@ -24,13 +24,14 @@ use std::sync::OnceLock; use arrow::datatypes::DataType; use arrow::datatypes::Field; use datafusion_common::{not_impl_err, Result}; -use 
datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; +use datafusion_macros::user_doc; make_udaf_expr_and_func!( Grouping, @@ -40,6 +41,27 @@ make_udaf_expr_and_func!( grouping_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set.", + syntax_example = "grouping(expression)", + sql_example = r#"```sql +> SELECT column_name, GROUPING(column_name) AS group_column + FROM table_name + GROUP BY GROUPING SETS ((column_name), ()); ++-------------+-------------+ +| column_name | group_column | ++-------------+-------------+ +| value1 | 0 | +| value2 | 0 | +| NULL | 1 | ++-------------+-------------+ +```"#, + argument( + name = "expression", + description = "Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function." 
+ ) +)] pub struct Grouping { signature: Signature, } @@ -100,33 +122,6 @@ impl AggregateUDFImpl for Grouping { } fn documentation(&self) -> Option<&Documentation> { - Some(get_grouping_doc()) + self.doc() } } - -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_grouping_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns 1 if the data is aggregated across the specified column, or 0 if it is not aggregated in the result set.", - - "grouping(expression)") - .with_sql_example(r#"```sql -> SELECT column_name, GROUPING(column_name) AS group_column - FROM table_name - GROUP BY GROUPING SETS ((column_name), ()); -+-------------+-------------+ -| column_name | group_column | -+-------------+-------------+ -| value1 | 0 | -| value2 | 0 | -| NULL | 1 | -+-------------+-------------+ -```"#, - ) - .with_argument("expression", "Expression to evaluate whether data is aggregated across the specified column. Can be a constant, column, or function.") - .build() - }) -} diff --git a/datafusion/functions-aggregate/src/median.rs b/datafusion/functions-aggregate/src/median.rs index bcffb19b75593..81e352433a0e2 100644 --- a/datafusion/functions-aggregate/src/median.rs +++ b/datafusion/functions-aggregate/src/median.rs @@ -34,13 +34,14 @@ use arrow::array::ArrowNativeTypeOp; use arrow::datatypes::{ArrowNativeType, ArrowPrimitiveType}; use datafusion_common::{DataFusionError, HashSet, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::{ function::AccumulatorArgs, utils::format_state_name, Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_functions_aggregate_common::utils::Hashable; +use datafusion_macros::user_doc; make_udaf_expr_and_func!( Median, @@ -50,6 +51,20 @@ make_udaf_expr_and_func!( median_udaf ); +#[user_doc( + 
doc_section(label = "General Functions"), + description = "Returns the median value in the specified column.", + syntax_example = "median(expression)", + sql_example = r#"```sql +> SELECT median(column_name) FROM table_name; ++----------------------+ +| median(column_name) | ++----------------------+ +| 45.5 | ++----------------------+ +```"#, + standard_argument(name = "expression", prefix = "The") +)] /// MEDIAN aggregate expression. If using the non-distinct variation, then this uses a /// lot of memory because all values need to be stored in memory before a result can be /// computed. If an approximation is sufficient then APPROX_MEDIAN provides a much more @@ -156,34 +171,10 @@ impl AggregateUDFImpl for Median { } fn documentation(&self) -> Option<&Documentation> { - Some(get_median_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_median_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the median value in the specified column.", - "median(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT median(column_name) FROM table_name; -+----------------------+ -| median(column_name) | -+----------------------+ -| 45.5 | -+----------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - /// The median accumulator accumulates the raw input values /// as `ScalarValue`s /// diff --git a/datafusion/functions-aggregate/src/min_max.rs b/datafusion/functions-aggregate/src/min_max.rs index 2077f15674111..acbeebaad68b9 100644 --- a/datafusion/functions-aggregate/src/min_max.rs +++ b/datafusion/functions-aggregate/src/min_max.rs @@ -41,7 +41,6 @@ use datafusion_common::stats::Precision; use datafusion_common::{ downcast_value, exec_err, internal_err, ColumnStatistics, DataFusionError, Result, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; use 
datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_physical_expr::expressions; use std::cmp::Ordering; @@ -56,11 +55,13 @@ use arrow::datatypes::{ use crate::min_max::min_max_bytes::MinMaxBytesAccumulator; use datafusion_common::ScalarValue; +use datafusion_doc::DocSection; use datafusion_expr::{ function::AccumulatorArgs, Accumulator, AggregateUDFImpl, Documentation, Signature, Volatility, }; use datafusion_expr::{GroupsAccumulator, StatisticsArgs}; +use datafusion_macros::user_doc; use half::f16; use std::mem::size_of_val; use std::ops::Deref; @@ -87,6 +88,20 @@ fn get_min_max_result_type(input_types: &[DataType]) -> Result> { } } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the maximum value in the specified column.", + syntax_example = "max(expression)", + sql_example = r#"```sql +> SELECT max(column_name) FROM table_name; ++----------------------+ +| max(column_name) | ++----------------------+ +| 150 | ++----------------------+ +```"#, + standard_argument(name = "expression",) +)] // MAX aggregate UDF #[derive(Debug)] pub struct Max { @@ -346,34 +361,10 @@ impl AggregateUDFImpl for Max { } fn documentation(&self) -> Option<&Documentation> { - Some(get_max_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_max_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the maximum value in the specified column.", - "max(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT max(column_name) FROM table_name; -+----------------------+ -| max(column_name) | -+----------------------+ -| 150 | -+----------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - // Statically-typed version of min/max(array) -> ScalarValue for string types macro_rules! 
typed_min_max_batch_string { ($VALUES:expr, $ARRAYTYPE:ident, $SCALAR:ident, $OP:ident) => {{ @@ -997,6 +988,20 @@ impl Accumulator for SlidingMaxAccumulator { } } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the minimum value in the specified column.", + syntax_example = "min(expression)", + sql_example = r#"```sql +> SELECT min(column_name) FROM table_name; ++----------------------+ +| min(column_name) | ++----------------------+ +| 12 | ++----------------------+ +```"#, + standard_argument(name = "expression",) +)] #[derive(Debug)] pub struct Min { signature: Signature, @@ -1178,32 +1183,10 @@ impl AggregateUDFImpl for Min { } fn documentation(&self) -> Option<&Documentation> { - Some(get_min_doc()) + self.doc() } } -fn get_min_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the minimum value in the specified column.", - "min(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT min(column_name) FROM table_name; -+----------------------+ -| min(column_name) | -+----------------------+ -| 12 | -+----------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - /// An accumulator to compute the minimum value #[derive(Debug)] pub struct MinAccumulator { diff --git a/datafusion/functions-aggregate/src/nth_value.rs b/datafusion/functions-aggregate/src/nth_value.rs index 0c72939633b16..ddb9c0e67d545 100644 --- a/datafusion/functions-aggregate/src/nth_value.rs +++ b/datafusion/functions-aggregate/src/nth_value.rs @@ -28,7 +28,7 @@ use arrow_schema::{DataType, Field, Fields}; use datafusion_common::utils::{array_into_list_array_nullable, get_row_at_idx}; use datafusion_common::{exec_err, internal_err, not_impl_err, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use 
datafusion_expr::utils::format_state_name; use datafusion_expr::{ @@ -37,6 +37,7 @@ use datafusion_expr::{ }; use datafusion_functions_aggregate_common::merge_arrays::merge_ordered_arrays; use datafusion_functions_aggregate_common::utils::ordering_fields; +use datafusion_macros::user_doc; use datafusion_physical_expr::expressions::Literal; use datafusion_physical_expr_common::sort_expr::{LexOrdering, PhysicalSortExpr}; @@ -60,6 +61,32 @@ pub fn nth_value( } } +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the nth value in a group of values.", + syntax_example = "nth_value(expression, n ORDER BY expression)", + sql_example = r#"```sql +> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept + FROM employee; ++---------+--------+-------------------------+ +| dept_id | salary | second_salary_by_dept | ++---------+--------+-------------------------+ +| 1 | 30000 | NULL | +| 1 | 40000 | 40000 | +| 1 | 50000 | 40000 | +| 2 | 35000 | NULL | +| 2 | 45000 | 45000 | ++---------+--------+-------------------------+ +```"#, + argument( + name = "expression", + description = "The column or expression to retrieve the nth value from." + ), + argument( + name = "n", + description = "The position (nth) of the value to retrieve, based on the ordering." + ) +)] /// Expression for a `NTH_VALUE(... ORDER BY ..., ...)` aggregation. In a multi /// partition setting, partial aggregations are computed for every partition, /// and then their results are merged. 
@@ -165,38 +192,10 @@ impl AggregateUDFImpl for NthValueAgg { } fn documentation(&self) -> Option<&Documentation> { - Some(get_nth_value_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_nth_value_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the nth value in a group of values.", - - "nth_value(expression, n ORDER BY expression)") - .with_sql_example(r#"```sql -> SELECT dept_id, salary, NTH_VALUE(salary, 2) OVER (PARTITION BY dept_id ORDER BY salary ASC) AS second_salary_by_dept - FROM employee; -+---------+--------+-------------------------+ -| dept_id | salary | second_salary_by_dept | -+---------+--------+-------------------------+ -| 1 | 30000 | NULL | -| 1 | 40000 | 40000 | -| 1 | 50000 | 40000 | -| 2 | 35000 | NULL | -| 2 | 45000 | 45000 | -+---------+--------+-------------------------+ -```"#) - .with_argument("expression", "The column or expression to retrieve the nth value from.") - .with_argument("n", "The position (nth) of the value to retrieve, based on the ordering.") - .build() - }) -} - #[derive(Debug)] pub struct NthValueAccumulator { /// The `N` value. 
diff --git a/datafusion/functions-aggregate/src/stddev.rs b/datafusion/functions-aggregate/src/stddev.rs index afc9bf6255c28..09a39e342cce6 100644 --- a/datafusion/functions-aggregate/src/stddev.rs +++ b/datafusion/functions-aggregate/src/stddev.rs @@ -27,7 +27,7 @@ use arrow::{array::ArrayRef, datatypes::DataType, datatypes::Field}; use datafusion_common::{internal_err, not_impl_err, Result}; use datafusion_common::{plan_err, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_STATISTICAL; +use datafusion_doc::DocSection; use datafusion_expr::function::{AccumulatorArgs, StateFieldsArgs}; use datafusion_expr::utils::format_state_name; use datafusion_expr::{ @@ -35,6 +35,7 @@ use datafusion_expr::{ Volatility, }; use datafusion_functions_aggregate_common::stats::StatsType; +use datafusion_macros::user_doc; use crate::variance::{VarianceAccumulator, VarianceGroupsAccumulator}; @@ -46,6 +47,20 @@ make_udaf_expr_and_func!( stddev_udaf ); +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the standard deviation of a set of numbers.", + syntax_example = "stddev(expression)", + sql_example = r#"```sql +> SELECT stddev(column_name) FROM table_name; ++----------------------+ +| stddev(column_name) | ++----------------------+ +| 12.34 | ++----------------------+ +```"#, + standard_argument(name = "expression",) +)] /// STDDEV and STDDEV_SAMP (standard deviation) aggregate expression pub struct Stddev { signature: Signature, @@ -134,34 +149,10 @@ impl AggregateUDFImpl for Stddev { } fn documentation(&self) -> Option<&Documentation> { - Some(get_stddev_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_stddev_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the standard deviation of a set of numbers.", - "stddev(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT stddev(column_name) FROM table_name; 
-+----------------------+ -| stddev(column_name) | -+----------------------+ -| 12.34 | -+----------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - make_udaf_expr_and_func!( StddevPop, stddev_pop, @@ -170,6 +161,20 @@ make_udaf_expr_and_func!( stddev_pop_udaf ); +#[user_doc( + doc_section(label = "Statistical Functions"), + description = "Returns the population standard deviation of a set of numbers.", + syntax_example = "stddev_pop(expression)", + sql_example = r#"```sql +> SELECT stddev_pop(column_name) FROM table_name; ++--------------------------+ +| stddev_pop(column_name) | ++--------------------------+ +| 10.56 | ++--------------------------+ +```"#, + standard_argument(name = "expression",) +)] /// STDDEV_POP population aggregate expression pub struct StddevPop { signature: Signature, @@ -258,32 +263,10 @@ impl AggregateUDFImpl for StddevPop { } fn documentation(&self) -> Option<&Documentation> { - Some(get_stddev_pop_doc()) + self.doc() } } -fn get_stddev_pop_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_STATISTICAL, - "Returns the population standard deviation of a set of numbers.", - "stddev_pop(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT stddev_pop(column_name) FROM table_name; -+--------------------------+ -| stddev_pop(column_name) | -+--------------------------+ -| 10.56 | -+--------------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - /// An accumulator to compute the average #[derive(Debug)] pub struct StddevAccumulator { diff --git a/datafusion/functions-aggregate/src/string_agg.rs b/datafusion/functions-aggregate/src/string_agg.rs index 4fd2d91b46c0f..5a52bec55f157 100644 --- a/datafusion/functions-aggregate/src/string_agg.rs +++ b/datafusion/functions-aggregate/src/string_agg.rs @@ -22,11 +22,12 @@ use arrow_schema::DataType; use datafusion_common::cast::as_generic_string_array; 
use datafusion_common::Result; use datafusion_common::{not_impl_err, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::{ Accumulator, AggregateUDFImpl, Documentation, Signature, TypeSignature, Volatility, }; +use datafusion_macros::user_doc; use datafusion_physical_expr::expressions::Literal; use std::any::Any; use std::mem::size_of_val; @@ -40,6 +41,28 @@ make_udaf_expr_and_func!( string_agg_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Concatenates the values of string expressions and places separator values between them.", + syntax_example = "string_agg(expression, delimiter)", + sql_example = r#"```sql +> SELECT string_agg(name, ', ') AS names_list + FROM employee; ++--------------------------+ +| names_list | ++--------------------------+ +| Alice, Bob, Charlie | ++--------------------------+ +```"#, + argument( + name = "expression", + description = "The string expression to concatenate. Can be a column or any valid string expression." + ), + argument( + name = "delimiter", + description = "A literal string used as a separator between the concatenated values." 
+ ) +)] /// STRING_AGG aggregate expression #[derive(Debug)] pub struct StringAgg { @@ -103,35 +126,10 @@ impl AggregateUDFImpl for StringAgg { } fn documentation(&self) -> Option<&Documentation> { - Some(get_string_agg_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_string_agg_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Concatenates the values of string expressions and places separator values between them.", - - "string_agg(expression, delimiter)") - .with_sql_example(r#"```sql -> SELECT string_agg(name, ', ') AS names_list - FROM employee; -+--------------------------+ -| names_list | -+--------------------------+ -| Alice, Bob, Charlie | -+--------------------------+ -```"#, - ) - .with_argument("expression", "The string expression to concatenate. Can be a column or any valid string expression.") - .with_argument("delimiter", "A literal string used as a separator between the concatenated values.") - .build() - }) -} - #[derive(Debug)] pub(crate) struct StringAggAccumulator { values: Option, diff --git a/datafusion/functions-aggregate/src/sum.rs b/datafusion/functions-aggregate/src/sum.rs index 447b5d8a57c44..ec4557d73b7ab 100644 --- a/datafusion/functions-aggregate/src/sum.rs +++ b/datafusion/functions-aggregate/src/sum.rs @@ -35,7 +35,7 @@ use arrow::datatypes::{ }; use arrow::{array::ArrayRef, datatypes::Field}; use datafusion_common::{exec_err, not_impl_err, Result, ScalarValue}; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::function::AccumulatorArgs; use datafusion_expr::function::StateFieldsArgs; use datafusion_expr::utils::format_state_name; @@ -45,6 +45,7 @@ use datafusion_expr::{ }; use datafusion_functions_aggregate_common::aggregate::groups_accumulator::prim_op::PrimitiveGroupsAccumulator; use datafusion_functions_aggregate_common::utils::Hashable; +use 
datafusion_macros::user_doc; make_udaf_expr_and_func!( Sum, @@ -79,6 +80,20 @@ macro_rules! downcast_sum { }; } +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the sum of all values in the specified column.", + syntax_example = "sum(expression)", + sql_example = r#"```sql +> SELECT sum(column_name) FROM table_name; ++-----------------------+ +| sum(column_name) | ++-----------------------+ +| 12345 | ++-----------------------+ +```"#, + standard_argument(name = "expression",) +)] #[derive(Debug)] pub struct Sum { signature: Signature, @@ -239,34 +254,10 @@ impl AggregateUDFImpl for Sum { } fn documentation(&self) -> Option<&Documentation> { - Some(get_sum_doc()) + self.doc() } } -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_sum_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the sum of all values in the specified column.", - "sum(expression)", - ) - .with_sql_example( - r#"```sql -> SELECT sum(column_name) FROM table_name; -+-----------------------+ -| sum(column_name) | -+-----------------------+ -| 12345 | -+-----------------------+ -```"#, - ) - .with_standard_argument("expression", None) - .build() - }) -} - /// This accumulator computes SUM incrementally struct SumAccumulator { sum: Option, diff --git a/datafusion/functions-aggregate/src/variance.rs b/datafusion/functions-aggregate/src/variance.rs index 9c99a9138dd5d..70b10734088fc 100644 --- a/datafusion/functions-aggregate/src/variance.rs +++ b/datafusion/functions-aggregate/src/variance.rs @@ -31,7 +31,7 @@ use std::{fmt::Debug, sync::Arc}; use datafusion_common::{ downcast_value, not_impl_err, plan_err, DataFusionError, Result, ScalarValue, }; -use datafusion_expr::aggregate_doc_sections::DOC_SECTION_GENERAL; +use datafusion_doc::DocSection; use datafusion_expr::{ function::{AccumulatorArgs, StateFieldsArgs}, utils::format_state_name, @@ -41,6 +41,7 @@ use datafusion_expr::{ use 
datafusion_functions_aggregate_common::{ aggregate::groups_accumulator::accumulate::accumulate, stats::StatsType, }; +use datafusion_macros::user_doc; make_udaf_expr_and_func!( VarianceSample, @@ -58,6 +59,12 @@ make_udaf_expr_and_func!( var_pop_udaf ); +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the statistical sample variance of a set of numbers.", + syntax_example = "var(expression)", + standard_argument(name = "expression", prefix = "Numeric") +)] pub struct VarianceSample { signature: Signature, aliases: Vec, @@ -137,24 +144,16 @@ impl AggregateUDFImpl for VarianceSample { } fn documentation(&self) -> Option<&Documentation> { - Some(get_variance_sample_doc()) + self.doc() } } -static VARIANCE_SAMPLE_DOC: OnceLock = OnceLock::new(); - -fn get_variance_sample_doc() -> &'static Documentation { - VARIANCE_SAMPLE_DOC.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the statistical sample variance of a set of numbers.", - "var(expression)", - ) - .with_standard_argument("expression", Some("Numeric")) - .build() - }) -} - +#[user_doc( + doc_section(label = "General Functions"), + description = "Returns the statistical population variance of a set of numbers.", + syntax_example = "var_pop(expression)", + standard_argument(name = "expression", prefix = "Numeric") +)] pub struct VariancePopulation { signature: Signature, aliases: Vec, @@ -241,24 +240,10 @@ impl AggregateUDFImpl for VariancePopulation { ))) } fn documentation(&self) -> Option<&Documentation> { - Some(get_variance_population_doc()) + self.doc() } } -static VARIANCE_POPULATION_DOC: OnceLock = OnceLock::new(); - -fn get_variance_population_doc() -> &'static Documentation { - VARIANCE_POPULATION_DOC.get_or_init(|| { - Documentation::builder( - DOC_SECTION_GENERAL, - "Returns the statistical population variance of a set of numbers.", - "var_pop(expression)", - ) - .with_standard_argument("expression", Some("Numeric")) - .build() - }) -} - /// 
An accumulator to compute variance /// The algorithm used is an online implementation and numerically stable. It is based on this paper: /// Welford, B. P. (1962). "Note on a method for calculating corrected sums of squares and products". diff --git a/datafusion/functions/src/datetime/to_date.rs b/datafusion/functions/src/datetime/to_date.rs index 15f1159ef9841..e2edea843e980 100644 --- a/datafusion/functions/src/datetime/to_date.rs +++ b/datafusion/functions/src/datetime/to_date.rs @@ -22,6 +22,7 @@ use arrow::error::ArrowError::ParseError; use arrow::{array::types::Date32Type, compute::kernels::cast_utils::Parser}; use datafusion_common::error::DataFusionError; use datafusion_common::{arrow_err, exec_err, internal_datafusion_err, Result}; +use datafusion_doc::DocSection; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; diff --git a/datafusion/functions/src/math/abs.rs b/datafusion/functions/src/math/abs.rs index 3565afbe6b48c..c0c7c6f0f6b6b 100644 --- a/datafusion/functions/src/math/abs.rs +++ b/datafusion/functions/src/math/abs.rs @@ -27,12 +27,13 @@ use arrow::array::{ use arrow::datatypes::DataType; use arrow::error::ArrowError; use datafusion_common::{exec_err, not_impl_err, DataFusionError, Result}; +use datafusion_doc::DocSection; use datafusion_expr::interval_arithmetic::Interval; -use datafusion_expr::scalar_doc_sections::DOC_SECTION_MATH; use datafusion_expr::sort_properties::{ExprProperties, SortProperties}; use datafusion_expr::{ ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, }; +use datafusion_macros::user_doc; type MathArrayFunction = fn(&Vec) -> Result; @@ -103,6 +104,12 @@ fn create_abs_function(input_data_type: &DataType) -> Result other => not_impl_err!("Unsupported data type {other:?} for function abs"), } } +#[user_doc( + doc_section(label = "Math Functions"), + description = "Returns the absolute value of a number.", + syntax_example = "abs(numeric_expression)", + 
standard_argument(name = "numeric_expression", prefix = "Numeric") +)] #[derive(Debug)] pub struct AbsFunc { signature: Signature, @@ -193,20 +200,6 @@ impl ScalarUDFImpl for AbsFunc { } fn documentation(&self) -> Option<&Documentation> { - Some(get_abs_doc()) + self.doc() } } - -static DOCUMENTATION: OnceLock = OnceLock::new(); - -fn get_abs_doc() -> &'static Documentation { - DOCUMENTATION.get_or_init(|| { - Documentation::builder( - DOC_SECTION_MATH, - "Returns the absolute value of a number.", - "abs(numeric_expression)", - ) - .with_standard_argument("numeric_expression", Some("Numeric")) - .build() - }) -} diff --git a/datafusion/functions/src/string/ltrim.rs b/datafusion/functions/src/string/ltrim.rs index 93470368803ae..e0e83d1b01e35 100644 --- a/datafusion/functions/src/string/ltrim.rs +++ b/datafusion/functions/src/string/ltrim.rs @@ -22,6 +22,7 @@ use std::any::Any; use crate::string::common::*; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::{exec_err, Result}; +use datafusion_doc::DocSection; use datafusion_expr::function::Hint; use datafusion_expr::{ColumnarValue, Documentation, TypeSignature, Volatility}; use datafusion_expr::{ScalarUDFImpl, Signature}; diff --git a/datafusion/macros/src/lib.rs b/datafusion/macros/src/lib.rs index e4eeeba7a4abc..54b688ac2a497 100644 --- a/datafusion/macros/src/lib.rs +++ b/datafusion/macros/src/lib.rs @@ -190,6 +190,12 @@ pub fn user_doc(args: TokenStream, input: TokenStream) -> TokenStream { .map(|desc| quote! { Some(#desc)}) .unwrap_or(quote! { None }); + let sql_example = sql_example.map(|ex| { + quote! { + .with_sql_example(#ex) + } + }); + let udf_args = udf_args .iter() .map(|(name, desc)| { @@ -202,8 +208,14 @@ pub fn user_doc(args: TokenStream, input: TokenStream) -> TokenStream { let standard_args = standard_args .iter() .map(|(name, desc)| { + let desc = if let Some(d) = desc { + quote! { #d.into() } + } else { + quote! { None } + }; + quote! 
{ - .with_standard_argument(#name, #desc.into()) + .with_standard_argument(#name, #desc) } }) .collect::>(); @@ -223,20 +235,21 @@ pub fn user_doc(args: TokenStream, input: TokenStream) -> TokenStream { } }); + let lock_name: proc_macro2::TokenStream = + format!("{name}_DOCUMENTATION").parse().unwrap(); + let generated = quote! { #input - use datafusion_doc::DocSection; - use datafusion_doc::DocumentationBuilder; - - static DOCUMENTATION: OnceLock = OnceLock::new(); + static #lock_name: OnceLock = OnceLock::new(); impl #name { + fn doc(&self) -> Option<&Documentation> { - Some(DOCUMENTATION.get_or_init(|| { + Some(#lock_name.get_or_init(|| { Documentation::builder(DocSection { include: #doc_section_include, label: #doc_section_lbl, description: #doc_section_description }, #description.to_string(), #syntax_example.to_string()) - .with_sql_example(#sql_example.to_string()) + #sql_example #alt_syntax_example #(#standard_args)* #(#udf_args)* @@ -248,7 +261,9 @@ pub fn user_doc(args: TokenStream, input: TokenStream) -> TokenStream { }; // Debug the generated code if needed - //eprintln!("Generated code: {}", generated); + // if name == "ArrayAgg" { + // eprintln!("Generated code: {}", generated); + // } // Return the generated code TokenStream::from(generated) diff --git a/docs/source/user-guide/sql/aggregate_functions.md b/docs/source/user-guide/sql/aggregate_functions.md index d9fc28a81772d..221bb0572eb84 100644 --- a/docs/source/user-guide/sql/aggregate_functions.md +++ b/docs/source/user-guide/sql/aggregate_functions.md @@ -268,10 +268,10 @@ grouping(expression) ### `last_value` -Returns the first element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. +Returns the last element in an aggregation group according to the requested ordering. If no ordering is given, returns an arbitrary element from the group. 
``` -first_value(expression [ORDER BY expression]) +last_value(expression [ORDER BY expression]) ``` #### Arguments @@ -281,11 +281,11 @@ first_value(expression [ORDER BY expression]) #### Example ```sql -> SELECT first_value(column_name ORDER BY other_column) FROM table_name; +> SELECT last_value(column_name ORDER BY other_column) FROM table_name; +-----------------------------------------------+ -| first_value(column_name ORDER BY other_column)| +| last_value(column_name ORDER BY other_column) | +-----------------------------------------------+ -| first_element | +| last_element | +-----------------------------------------------+ ``` @@ -341,10 +341,10 @@ median(expression) ### `min` -Returns the maximum value in the specified column. +Returns the minimum value in the specified column. ``` -max(expression) +min(expression) ``` #### Arguments @@ -354,11 +354,11 @@ max(expression) #### Example ```sql -> SELECT max(column_name) FROM table_name; +> SELECT min(column_name) FROM table_name; +----------------------+ -| max(column_name) | +| min(column_name) | +----------------------+ -| 150 | +| 12 | +----------------------+ ``` @@ -730,10 +730,10 @@ stddev(expression) ### `stddev_pop` -Returns the standard deviation of a set of numbers. +Returns the population standard deviation of a set of numbers. ``` -stddev(expression) +stddev_pop(expression) ``` #### Arguments @@ -743,12 +743,12 @@ stddev(expression) #### Example ```sql -> SELECT stddev(column_name) FROM table_name; -+----------------------+ -| stddev(column_name) | -+----------------------+ -| 12.34 | -+----------------------+ +> SELECT stddev_pop(column_name) FROM table_name; ++--------------------------+ +| stddev_pop(column_name) | ++--------------------------+ +| 10.56 | ++--------------------------+ ``` ### `stddev_samp`