From 13ad9db3b6cf38a509901f92b24504a4f8c0802b Mon Sep 17 00:00:00 2001
From: Elia Perantoni
Date: Thu, 16 Jan 2025 14:58:30 +0100
Subject: [PATCH 01/33] Reset branch to no changes other than SqlRel

---
 datafusion/common/src/diagnostic.rs | 56 ++++++++++++++++++++++++++++
 datafusion/common/src/error.rs      | 57 ++++++++++++++++++++++++++++-
 datafusion/common/src/lib.rs        |  2 +
 datafusion/sql/src/parser.rs        |  4 +-
 4 files changed, 116 insertions(+), 3 deletions(-)
 create mode 100644 datafusion/common/src/diagnostic.rs

diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs
new file mode 100644
index 000000000000..ad6dbd8d500b
--- /dev/null
+++ b/datafusion/common/src/diagnostic.rs
@@ -0,0 +1,56 @@
+use sqlparser::tokenizer::Span;
+
+#[derive(Debug, Clone)]
+pub struct Diagnostic {
+    pub entries: Vec<DiagnosticEntry>,
+}
+
+#[derive(Debug, Clone)]
+pub struct DiagnosticEntry {
+    pub span: Span,
+    pub message: String,
+    pub kind: DiagnosticEntryKind,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum DiagnosticEntryKind {
+    Error,
+    Warning,
+    Note,
+    Help,
+}
+
+impl Diagnostic {
+    pub fn new(entries: impl IntoIterator<Item = DiagnosticEntry>) -> Self {
+        Diagnostic::from_iter(entries)
+    }
+}
+
+impl FromIterator<DiagnosticEntry> for Diagnostic {
+    fn from_iter<T: IntoIterator<Item = DiagnosticEntry>>(iter: T) -> Self {
+        Diagnostic {
+            entries: iter.into_iter().collect(),
+        }
+    }
+}
+
+impl DiagnosticEntry {
+    pub fn new(
+        message: impl Into<String>,
+        kind: DiagnosticEntryKind,
+        span: Span,
+    ) -> Self {
+        DiagnosticEntry {
+            span,
+            message: message.into(),
+            kind,
+        }
+    }
+
+    pub fn new_without_span(
+        message: impl Into<String>,
+        kind: DiagnosticEntryKind,
+    ) -> Self {
+        Self::new(message, kind, Span::empty())
+    }
+}
diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs
index f7c247aaf288..26479d8d3ada 100644
--- a/datafusion/common/src/error.rs
+++ b/datafusion/common/src/error.rs
@@ -27,7 +27,7 @@ use std::result;
 use std::sync::Arc;
 
 use crate::utils::quote_identifier;
-use crate::{Column, DFSchema, TableReference};
+use crate::{Column, DFSchema, Diagnostic, TableReference};
 #[cfg(feature = "avro")]
 use apache_avro::Error as AvroError;
 use arrow::error::ArrowError;
@@ -131,6 +131,11 @@ pub enum DataFusionError {
     /// Errors from either mapping LogicalPlans to/from Substrait plans
     /// or serializing/deserializing protobytes to Substrait plans
     Substrait(String),
+    /// Error wrapped together with additional contextual information intended
+    /// for end users, to help them understand what went wrong by providing
+    /// human-readable messages, and locations in the source query that relate
+    /// to the error in some way.
+    Diagnostic(Diagnostic, Box<DataFusionError>),
 }
 
 #[macro_export]
@@ -328,6 +333,7 @@ impl Error for DataFusionError {
             DataFusionError::External(e) => Some(e.as_ref()),
             DataFusionError::Context(_, e) => Some(e.as_ref()),
             DataFusionError::Substrait(_) => None,
+            DataFusionError::Diagnostic(_, e) => Some(e.as_ref()),
         }
     }
 }
@@ -441,6 +447,7 @@ impl DataFusionError {
             DataFusionError::External(_) => "External error: ",
             DataFusionError::Context(_, _) => "",
             DataFusionError::Substrait(_) => "Substrait error: ",
+            DataFusionError::Diagnostic(_, _) => "",
         }
     }
 
@@ -481,8 +488,56 @@ impl DataFusionError {
                 Cow::Owned(format!("{desc}\ncaused by\n{}", *err))
             }
             DataFusionError::Substrait(ref desc) => Cow::Owned(desc.to_string()),
+            DataFusionError::Diagnostic(_, ref err) => Cow::Owned(err.to_string()),
         }
     }
+
+    /// Wraps the error with contextual information intended for end users
+    pub fn with_diagnostic(self, diagnostic: Diagnostic) -> Self {
+        Self::Diagnostic(diagnostic, Box::new(self))
+    }
+
+    /// Wraps the error with contextual information intended for end users.
+    /// Takes a function that inspects the error and returns the diagnostic to
+    /// wrap it with.
+    pub fn with_diagnostic_fn<F: FnOnce(&DataFusionError) -> Diagnostic>(
+        self,
+        f: F,
+    ) -> Self {
+        let diagnostic = f(&self);
+        self.with_diagnostic(diagnostic)
+    }
+
+    pub fn get_diagnostics(&self) -> impl Iterator<Item = &Diagnostic> + '_ {
+        struct DiagnosticsIterator<'a> {
+            head: &'a DataFusionError,
+        }
+
+        impl<'a> Iterator for DiagnosticsIterator<'a> {
+            type Item = &'a Diagnostic;
+
+            fn next(&mut self) -> Option<Self::Item> {
+                loop {
+                    if let DataFusionError::Diagnostic(diagnostics, source) = self.head {
+                        self.head = source.as_ref();
+                        return Some(diagnostics);
+                    }
+
+                    if let Some(source) = self
+                        .head
+                        .source()
+                        .and_then(|source| source.downcast_ref::<DataFusionError>())
+                    {
+                        self.head = source;
+                    } else {
+                        return None;
+                    }
+                }
+            }
+        }
+
+        DiagnosticsIterator { head: self }
+    }
 }
 
 /// Unwrap an `Option` if possible. Otherwise return an `DataFusionError::Internal`.
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index 77e8cd60ede2..02d4a26b1591 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -47,6 +47,7 @@ pub mod test_util;
 pub mod tree_node;
 pub mod types;
 pub mod utils;
+pub mod diagnostic;
 
 /// Reexport arrow crate
 pub use arrow;
@@ -76,6 +77,7 @@ pub use stats::{ColumnStatistics, Statistics};
 pub use table_reference::{ResolvedTableReference, TableReference};
 pub use unnest::{RecursionUnnestOption, UnnestOptions};
 pub use utils::project_schema;
+pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind};
 
 // These are hidden from docs purely to avoid polluting the public view of what this crate exports.
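// A minimal sketch of how the pieces introduced in this first patch fit
// together (later patches refine the constructor); `table_span` is assumed to
// come from the parser:
//
//     let diagnostic = Diagnostic::new([DiagnosticEntry::new(
//         "table 'fruit' not found",
//         DiagnosticEntryKind::Error,
//         table_span,
//     )]);
//     let err = plan_datafusion_err!("table 'fruit' not found")
//         .with_diagnostic(diagnostic);
//     for diagnostic in err.get_diagnostics() {
//         for entry in &diagnostic.entries {
//             eprintln!("{:?}: {}", entry.kind, entry.message);
//         }
//     }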
// These are just re-exports of macros by the same name, which gets around the 'cannot refer to diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index f185d65fa194..e836fa89bf75 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -284,10 +284,10 @@ impl<'a> DFParser<'a> { dialect: &'a dyn Dialect, ) -> Result { let mut tokenizer = Tokenizer::new(dialect, sql); - let tokens = tokenizer.tokenize()?; + let tokens = tokenizer.tokenize_with_location()?; Ok(DFParser { - parser: Parser::new(dialect).with_tokens(tokens), + parser: Parser::new(dialect).with_tokens_with_locations(tokens), }) } From 4603b352ed2bf270ea0571f1b2bd9fea8b5b2c4c Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 16 Jan 2025 15:53:29 +0100 Subject: [PATCH 02/33] feat: improve `Diagnostic` ergonomics --- datafusion/common/src/diagnostic.rs | 33 +++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index ad6dbd8d500b..9d4d39fca841 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -21,8 +21,16 @@ pub enum DiagnosticEntryKind { } impl Diagnostic { - pub fn new(entries: impl IntoIterator) -> Self { - Diagnostic::from_iter(entries) + pub fn new() -> Self { + Default::default() + } +} + +impl Default for Diagnostic { + fn default() -> Self { + Diagnostic { + entries: Vec::new(), + } } } @@ -34,6 +42,27 @@ impl FromIterator for Diagnostic { } } +macro_rules! with_kind { + ($name:ident, $kind:expr) => { + pub fn $name(mut self, message: impl Into, span: Span) -> Self { + let entry = DiagnosticEntry { + span, + message: message.into(), + kind: $kind, + }; + self.entries.push(entry); + self + } + }; +} + +impl Diagnostic { + with_kind!(with_error, DiagnosticEntryKind::Error); + with_kind!(with_warning, DiagnosticEntryKind::Warning); + with_kind!(with_note, DiagnosticEntryKind::Note); + with_kind!(with_help, DiagnosticEntryKind::Help); +} + impl DiagnosticEntry { pub fn new( message: impl Into, From 86e1722d5a0b49cda7e062f0435fbb38ece1aa52 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 16 Jan 2025 15:53:44 +0100 Subject: [PATCH 03/33] feat: 'table not found' diagnostic in `SqlToRel` --- datafusion/sql/src/relation/mod.rs | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 8915b0069269..4d2ee3cc942b 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -20,11 +20,14 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{not_impl_err, plan_err, DFSchema, Result, TableReference}; +use datafusion_common::{ + not_impl_err, plan_err, DFSchema, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, + Result, TableReference, +}; use datafusion_expr::builder::subquery_alias; use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::{Subquery, SubqueryAlias}; -use sqlparser::ast::{FunctionArg, FunctionArgExpr, TableFactor}; +use sqlparser::ast::{FunctionArg, FunctionArgExpr, Spanned, TableFactor}; mod join; @@ -35,6 +38,7 @@ impl SqlToRel<'_, S> { relation: TableFactor, planner_context: &mut PlannerContext, ) -> Result { + let relation_span = relation.span(); let (plan, alias) = match relation { TableFactor::Table { name, 
alias, args, .. @@ -80,11 +84,19 @@ impl SqlToRel<'_, S> { self.context_provider.get_table_source(table_ref.clone()), ) { (Some(cte_plan), _) => Ok(cte_plan.clone()), - (_, Ok(provider)) => { - LogicalPlanBuilder::scan(table_ref, provider, None)? - .build() + (_, Ok(provider)) => LogicalPlanBuilder::scan( + table_ref.clone(), + provider, + None, + )? + .build(), + (None, Err(e)) => { + let e = e.with_diagnostic(Diagnostic::new().with_error( + format!("table '{}' not found", table_ref), + relation_span, + )); + Err(e) } - (None, Err(e)) => Err(e), }?, alias, ) From c3532c10a04a437cc75c74b3f9c38ac4107de1e2 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 17 Jan 2025 12:07:59 +0100 Subject: [PATCH 04/33] feat: unresolved fields --- datafusion/sql/src/expr/mod.rs | 129 ++++++++++++++++++++++++++++++++- 1 file changed, 127 insertions(+), 2 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 9b40ebdaf6a5..989c19b4042b 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,11 +15,21 @@ // specific language governing permissions and limitations // under the License. +use std::ops::ControlFlow; + use arrow_schema::DataType; use arrow_schema::TimeUnit; +use datafusion_common::Column; +use datafusion_common::DataFusionError; +use datafusion_common::Diagnostic; +use datafusion_common::SchemaError; use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; +use datafusion_expr::utils::find_column_exprs; +use sqlparser::ast::Spanned; +use sqlparser::ast::Visit; +use sqlparser::ast::Visitor; use sqlparser::ast::{ BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, @@ -36,7 +46,9 @@ use datafusion_expr::{ lit, Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal, Operator, TryCast, }; +use sqlparser::tokenizer::Span; +use crate::planner::IdentNormalizer; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; mod binary_op; @@ -165,13 +177,126 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { - let mut expr = self.sql_expr_to_logical_expr(sql, schema, planner_context)?; + // The location of the original SQL expression in the source code + let mut expr = + self.sql_expr_to_logical_expr(sql.clone(), schema, planner_context)?; expr = self.rewrite_partial_qualifier(expr, schema); - self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr))?; + let validation_result = + self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr)); + + // Must do it here because `validate_schema_satisfies_exprs` doesn't + // have access to the original SQL expression from the parser + validation_result.map_err(|err| { + if let DataFusionError::SchemaError( + SchemaError::FieldNotFound { + field, + valid_fields: _, + }, + _, + ) = &err + { + let diagnostic = self.get_field_not_found_diagnostic( + &sql, + field, + schema, + planner_context, + ); + err.with_diagnostic(diagnostic) + } else { + err + } + })?; + let (expr, _) = expr.infer_placeholder_types(schema)?; Ok(expr) } + /// Given an unresolved field in an expression, returns a [`Diagnostic`] + fn get_field_not_found_diagnostic( + &self, + expr: &SQLExpr, + unresolved_field: &Column, + schema: &DFSchema, + planner_context: &mut PlannerContext, + ) -> Diagnostic { + // Given a SQL expression like SELECT color, megabytes FROM 
fruit, where + // we assume that the 'megabytes' column doesn't exist in table 'fruit', + // we find that the logical expression Expr::Column(Column('megabytes')) + // is unresolved. Though, because we don't store `sqlparser::Span` in + // `datafusion::Expr`, we have no simple way to find where 'megabytes' + // appears in the query. + // + // Instead, we have to walk down the tree of subexpressions + // `sqlparser::Expr` rooted in the parameter `expr: sqlparser::Expr` to + // find a node that matches the unresolved one. + struct UnresolvedFieldFinder<'a, S: ContextProvider> { + unresolved_field: &'a Column, + sql_to_rel: &'a SqlToRel<'a, S>, + schema: &'a DFSchema, + planner_context: &'a mut PlannerContext, + } + impl Visitor for UnresolvedFieldFinder<'_, S> { + type Break = Span; + + fn post_visit_expr( + &mut self, + sql_expr: &SQLExpr, + ) -> ControlFlow { + let (logical_expr, span) = match sql_expr { + SQLExpr::Identifier(ident) => ( + self.sql_to_rel + .sql_identifier_to_expr( + ident.clone(), + self.schema, + self.planner_context, + ) + .ok(), + sql_expr.span(), + ), + SQLExpr::CompoundIdentifier(idents) => ( + self.sql_to_rel + .sql_compound_identifier_to_expr( + idents.clone(), + self.schema, + self.planner_context, + ) + .ok(), + sql_expr.span(), + ), + _ => (None, Span::empty()), + }; + match logical_expr { + Some(Expr::Column(col)) if &col == self.unresolved_field => { + ControlFlow::Break(span) + } + _ => ControlFlow::Continue(()), + } + } + } + let mut visitor = UnresolvedFieldFinder { + unresolved_field: &unresolved_field, + sql_to_rel: self, + schema, + planner_context, + }; + let span = match expr.visit(&mut visitor) { + ControlFlow::Break(span) => Some(span), + ControlFlow::Continue(_) => None, + }; + + if let Some(relation) = &unresolved_field.relation { + Diagnostic::new().with_error( + format!("column '{}' not found in '{}'", &unresolved_field.name, relation.to_string()), + span.unwrap_or(Span::empty()), + ) + } else { + Diagnostic::new().with_error( + format!("column '{}' not found", &unresolved_field.name), + span.unwrap_or(Span::empty()), + ) + } + } + /// Rewrite aliases which are not-complete (e.g. ones that only include only table qualifier in a schema.table qualified relation) fn rewrite_partial_qualifier(&self, expr: Expr, schema: &DFSchema) -> Expr { match expr { From 91f4e3c72512be6574466cd2ad534173d9c21101 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 17 Jan 2025 13:36:29 +0100 Subject: [PATCH 05/33] feat: union with different number of columns --- datafusion/sql/src/set_expr.rs | 59 +++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 4 deletions(-) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 75fdbd20e840..4ce84faf7a37 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -16,9 +16,12 @@ // under the License. 
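// For illustration, assuming the check added further down in this file:
// planning `SELECT 1, 2 UNION ALL SELECT 3` now fails with a Diagnostic
// roughly equivalent to
//
//     Diagnostic::new()
//         .with_error("UNION queries have different number of columns", set_expr_span)
//         .with_note("this side has 2 fields", left_span)
//         .with_note("this side has 1 fields", right_span)
//
// where the spans come from sqlparser's `Spanned` implementations for the
// whole set expression and its two sides.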
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, Result}; +use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result}; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier}; +use sqlparser::{ + ast::{SetExpr, SetOperator, SetQuantifier, Spanned}, + tokenizer::Span, +}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -27,6 +30,7 @@ impl SqlToRel<'_, S> { set_expr: SetExpr, planner_context: &mut PlannerContext, ) -> Result { + let set_expr_span = set_expr.span(); match set_expr { SetExpr::Select(s) => self.select_to_plan(*s, vec![], planner_context), SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context), @@ -36,8 +40,17 @@ impl SqlToRel<'_, S> { right, set_quantifier, } => { - let left_plan = self.set_expr_to_plan(*left, planner_context)?; - let right_plan = self.set_expr_to_plan(*right, planner_context)?; + let left_plan = self.set_expr_to_plan(*left.clone(), planner_context)?; + let right_plan = + self.set_expr_to_plan(*right.clone(), planner_context)?; + self.validate_set_expr_num_of_columns( + op, + &left, + &right, + &left_plan, + &right_plan, + set_expr_span, + )?; self.set_operation_to_plan(op, left_plan, right_plan, set_quantifier) } SetExpr::Query(q) => self.query_to_plan(*q, planner_context), @@ -61,6 +74,44 @@ impl SqlToRel<'_, S> { } } + fn validate_set_expr_num_of_columns( + &self, + op: SetOperator, + left: &SetExpr, + right: &SetExpr, + left_plan: &LogicalPlan, + right_plan: &LogicalPlan, + set_expr_span: Span, + ) -> Result<()> { + if left_plan.schema().fields().len() == right_plan.schema().fields().len() { + return Ok(()); + } + + plan_err!("{} queries have different number of columns", op).map_err(|err| { + err.with_diagnostic( + Diagnostic::new() + .with_error( + format!("{} queries have different number of columns", op), + set_expr_span, + ) + .with_note( + format!( + "this side has {} fields", + left_plan.schema().fields().len() + ), + left.span(), + ) + .with_note( + format!( + "this side has {} fields", + right_plan.schema().fields().len() + ), + right.span(), + ), + ) + }) + } + pub(super) fn set_operation_to_plan( &self, op: SetOperator, From 85db8af02ea2bd5065cf109af8f4b72292ac9832 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 12:38:22 +0100 Subject: [PATCH 06/33] feat: `Spans` --- datafusion/common/Cargo.toml | 1 + datafusion/common/src/column.rs | 18 ++- datafusion/common/src/config.rs | 5 + datafusion/common/src/dfschema.rs | 7 +- datafusion/common/src/lib.rs | 2 + datafusion/common/src/spans.rs | 77 ++++++++++++ .../core/src/execution/session_state.rs | 1 + datafusion/expr/src/expr.rs | 2 +- datafusion/expr/src/expr_rewriter/mod.rs | 7 +- datafusion/expr/src/expr_rewriter/order_by.rs | 5 +- .../src/analyzer/expand_wildcard_rule.rs | 2 + .../optimizer/src/analyzer/type_coercion.rs | 1 + datafusion/optimizer/src/decorrelate.rs | 2 +- datafusion/sql/src/expr/identifier.rs | 40 +++++-- datafusion/sql/src/expr/mod.rs | 113 +----------------- datafusion/sql/src/planner.rs | 23 +++- datafusion/sql/src/unparser/plan.rs | 2 + 17 files changed, 165 insertions(+), 143 deletions(-) create mode 100644 datafusion/common/src/spans.rs diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index feba589082b0..d4b4eee27e5d 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -68,6 +68,7 @@ pyo3 = { version = "0.22.0", optional = true 
} recursive = { workspace = true, optional = true } sqlparser = { workspace = true } tokio = { workspace = true } +smallvec = "1.13.2" [target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index fdde3d69eddb..f4bb0f8e6461 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -21,7 +21,7 @@ use arrow_schema::{Field, FieldRef}; use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; -use crate::{DFSchema, Result, SchemaError, TableReference}; +use crate::{DFSchema, Result, SchemaError, Spans, TableReference}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; @@ -34,6 +34,7 @@ pub struct Column { pub relation: Option, /// field/column name. pub name: String, + pub spans: Spans, } impl Column { @@ -50,6 +51,7 @@ impl Column { Self { relation: relation.map(|r| r.into()), name: name.into(), + spans: Spans::new(), } } @@ -58,6 +60,7 @@ impl Column { Self { relation: None, name: name.into(), + spans: Spans::new(), } } @@ -68,6 +71,7 @@ impl Column { Self { relation: None, name: name.into(), + spans: Spans::new(), } } @@ -99,7 +103,7 @@ impl Column { // identifiers will be treated as an unqualified column name _ => return None, }; - Some(Self { relation, name }) + Some(Self { relation, name, spans: Spans::new() }) } /// Deserialize a fully qualified name string into a column @@ -113,6 +117,7 @@ impl Column { Self { relation: None, name: flat_name, + spans: Spans::new(), }, ) } @@ -124,6 +129,7 @@ impl Column { Self { relation: None, name: flat_name, + spans: Spans::new(), }, ) } @@ -254,6 +260,14 @@ impl Column { .collect(), }) } + + pub fn spans(&self) -> &Spans { + &self.spans + } + + pub fn spans_mut(&mut self) -> &mut Spans { + &mut self.spans + } } impl From<&str> for Column { diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 33a90017bd7e..5fd2fe69c6c9 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -236,6 +236,11 @@ config_namespace! { /// specified. The Arrow type system does not have a notion of maximum /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true + + /// When set to true, the source locations relative to the original SQL + /// query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected + /// and recorded in the logical plan nodes. 
+ pub collect_spans: bool, default = false } } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 302d515e027e..e3f36fb17af5 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -502,10 +502,9 @@ impl DFSchema { Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1)) } else { _schema_err!(SchemaError::AmbiguousReference { - field: Column { - relation: None, - name: name.to_string(), - }, + field: Column::new_unqualified( + name.to_string(), + ), }) } } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 02d4a26b1591..565d6064ee62 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -48,6 +48,7 @@ pub mod tree_node; pub mod types; pub mod utils; pub mod diagnostic; +pub mod spans; /// Reexport arrow crate pub use arrow; @@ -78,6 +79,7 @@ pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; pub use utils::project_schema; pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; +pub use spans::Spans; // These are hidden from docs purely to avoid polluting the public view of what this crate exports. // These are just re-exports of macros by the same name, which gets around the 'cannot refer to diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs new file mode 100644 index 000000000000..bfec00fa673f --- /dev/null +++ b/datafusion/common/src/spans.rs @@ -0,0 +1,77 @@ +use smallvec::SmallVec; +use sqlparser::tokenizer::Span; +use std::cmp::Ordering; +use std::hash::{Hash, Hasher}; + +// / A collection of [`Span`], meant to be used as a field of entities whose +/// location in the original SQL query is desired to be tracked. Sometimes an +/// entity can have multiple spans. e.g. if you want to track the position of +/// the column a that comes from SELECT 1 AS a UNION ALL SELECT 2 AS a you'll +/// need two spans. +#[derive(Debug, Clone)] + // Store teh first [`Span`] on the stack because that is by far the most common + // case. More will spill onto the heap. +pub struct Spans(pub SmallVec<[Span; 1]>); + +impl Spans { + pub fn new() -> Self { + Spans(SmallVec::new()) + } + + pub fn first_or_empty(&self) -> Span { + self.0.get(0).copied().unwrap_or(Span::empty()) + } + + pub fn get_spans(&self) -> &[Span] { + &self.0 + } + + pub fn add_span(&mut self, span: Span) { + self.0.push(span); + } +} + +impl Default for Spans { + fn default() -> Self { + Self::new() + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl PartialEq for Spans { + fn eq(&self, _other: &Self) -> bool { + true + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl Eq for Spans {} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. 
+impl PartialOrd for Spans { + fn partial_cmp(&self, _other: &Self) -> Option { + Some(Ordering::Equal) + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl Ord for Spans { + fn cmp(&self, _other: &Self) -> Ordering { + Ordering::Equal + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl Hash for Spans { + fn hash(&self, _state: &mut H) {} +} diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index c5874deb6ed5..1c5fd4187df2 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -579,6 +579,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, + collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index b8e495ee7ae9..d0e3d4710105 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1111,7 +1111,7 @@ impl Expr { /// output schema. We can use this qualified name to reference the field. pub fn qualified_name(&self) -> (Option, String) { match self { - Expr::Column(Column { relation, name }) => (relation.clone(), name.clone()), + Expr::Column(Column { relation, name, spans: _ }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. 
}) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 335d5587bee7..6746fba7870c 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -157,10 +157,7 @@ pub fn unnormalize_col(expr: Expr) -> Expr { expr.transform(|expr| { Ok({ if let Expr::Column(c) = expr { - let col = Column { - relation: None, - name: c.name, - }; + let col = Column::new_unqualified(c.name); Transformed::yes(Expr::Column(col)) } else { Transformed::no(expr) @@ -181,7 +178,7 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { relation: _, name }) => Ok(Column::new( + Expr::Column(Column { relation: _, name, spans }) => Ok(Column::new( Some::(subqry_alias.into()), name, )), diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index f0d3d8fcd0c1..0044b6cf6f37 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -98,10 +98,7 @@ fn rewrite_in_terms_of_projection( // for a column with the same "MIN(C2)", so translate there let name = normalized_expr.schema_name().to_string(); - let search_col = Expr::Column(Column { - relation: None, - name, - }); + let search_col = Expr::Column(Column::new_unqualified(name)); // look for the column named the same as this expr if let Some(found) = proj_exprs.iter().find(|a| expr_match(&search_col, a)) { diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 9fbe54e1ccb9..e887e3c50e6f 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -123,6 +123,8 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { Expr::Column(Column { ref relation, ref name, + // TODO Should we use these spans? + spans: _ }) => { if name.eq("*") { if let Some(qualifier) = relation { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 48a5e2f9a07c..9fc282a8e235 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -1000,6 +1000,7 @@ fn project_with_column_index( Expr::Column(Column { relation: _, ref name, + spans: _, }) if name != schema.field(i).name() => Ok(e.alias(schema.field(i).name())), Expr::Alias { .. } | Expr::Column { .. } => Ok(e), Expr::Wildcard { .. } => { diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index ee6ea08b43bf..50c24949d766 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -533,7 +533,7 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. 
}) => name.to_string(), - Expr::Column(Column { relation: _, name }) => name.to_string(), + Expr::Column(Column { relation: _, name, spans: _ }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9adf14459081..3ef6d8a5805a 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -16,14 +16,14 @@ // under the License. use arrow_schema::Field; -use sqlparser::ast::{Expr as SQLExpr, Ident}; - use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DataFusionError, Result, TableReference, }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; +use sqlparser::ast::{Expr as SQLExpr, Ident, Spanned}; +use sqlparser::tokenizer::Span; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_expr::UNNAMED_TABLE; @@ -35,6 +35,7 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { + let id_span = id.span; if id.value.starts_with('@') { // TODO: figure out if ScalarVariables should be insensitive. let var_names = vec![id.value]; @@ -56,10 +57,14 @@ impl SqlToRel<'_, S> { if let Ok((qualifier, _)) = schema.qualified_field_with_unqualified_name(normalize_ident.as_str()) { - return Ok(Expr::Column(Column { - relation: qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), - name: normalize_ident, - })); + let mut column = Column::new( + qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), + normalize_ident, + ); + if self.options.collect_spans { + column.spans_mut().add_span(id_span); + } + return Ok(Expr::Column(column)); } // Check the outer query schema @@ -76,10 +81,11 @@ impl SqlToRel<'_, S> { } // Default case - Ok(Expr::Column(Column { - relation: None, - name: normalize_ident, - })) + let mut column = Column::new_unqualified(normalize_ident); + if self.options.collect_spans { + column.spans_mut().add_span(id_span); + } + Ok(Expr::Column(column)) } } @@ -93,6 +99,8 @@ impl SqlToRel<'_, S> { return internal_err!("Not a compound identifier: {ids:?}"); } + let ids_span = Span::union_iter(ids.iter().map(|id| id.span)); + if ids[0].value.starts_with('@') { let var_names: Vec<_> = ids .into_iter() @@ -136,7 +144,11 @@ impl SqlToRel<'_, S> { } // Found matching field with no spare identifier(s) Some((field, qualifier, _nested_names)) => { - Ok(Expr::Column(Column::from((qualifier, field)))) + let mut column = Column::from((qualifier, field)); + if self.options.collect_spans { + column.spans_mut().add_span(ids_span); + } + Ok(Expr::Column(column)) } None => { // Return default where use all identifiers to not have a nested field @@ -179,7 +191,11 @@ impl SqlToRel<'_, S> { let s = &ids[0..ids.len()]; // Safe unwrap as s can never be empty or exceed the bounds let (relation, column_name) = form_identifier(s).unwrap(); - Ok(Expr::Column(Column::new(relation, column_name))) + let mut column = Column::new(relation, column_name); + if self.options.collect_spans { + column.spans_mut().add_span(ids_span); + } + Ok(Expr::Column(column)) } } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 989c19b4042b..dac176e5ffc0 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -181,122 +181,11 @@ impl SqlToRel<'_, S> { let mut expr = self.sql_expr_to_logical_expr(sql.clone(), schema, planner_context)?; 
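// A minimal sketch of opting into the span collection added by this patch,
// assuming the existing `SqlToRel::new_with_options` constructor and a
// `context_provider` defined elsewhere:
//
//     let options = ParserOptions {
//         collect_spans: true,
//         ..Default::default()
//     };
//     let planner = SqlToRel::new_with_options(&context_provider, options);
//     // Columns planned from the SQL text now carry their source location,
//     // e.g. `column.spans().first_or_empty()` yields the identifier's Span.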
expr = self.rewrite_partial_qualifier(expr, schema); - let validation_result = - self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr)); - - // Must do it here because `validate_schema_satisfies_exprs` doesn't - // have access to the original SQL expression from the parser - validation_result.map_err(|err| { - if let DataFusionError::SchemaError( - SchemaError::FieldNotFound { - field, - valid_fields: _, - }, - _, - ) = &err - { - let diagnostic = self.get_field_not_found_diagnostic( - &sql, - field, - schema, - planner_context, - ); - err.with_diagnostic(diagnostic) - } else { - err - } - })?; - + self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr))?; let (expr, _) = expr.infer_placeholder_types(schema)?; Ok(expr) } - /// Given an unresolved field in an expression, returns a [`Diagnostic`] - fn get_field_not_found_diagnostic( - &self, - expr: &SQLExpr, - unresolved_field: &Column, - schema: &DFSchema, - planner_context: &mut PlannerContext, - ) -> Diagnostic { - // Given a SQL expression like SELECT color, megabytes FROM fruit, where - // we assume that the 'megabytes' column doesn't exist in table 'fruit', - // we find that the logical expression Expr::Column(Column('megabytes')) - // is unresolved. Though, because we don't store `sqlparser::Span` in - // `datafusion::Expr`, we have no simple way to find where 'megabytes' - // appears in the query. - // - // Instead, we have to walk down the tree of subexpressions - // `sqlparser::Expr` rooted in the parameter `expr: sqlparser::Expr` to - // find a node that matches the unresolved one. - struct UnresolvedFieldFinder<'a, S: ContextProvider> { - unresolved_field: &'a Column, - sql_to_rel: &'a SqlToRel<'a, S>, - schema: &'a DFSchema, - planner_context: &'a mut PlannerContext, - } - impl Visitor for UnresolvedFieldFinder<'_, S> { - type Break = Span; - - fn post_visit_expr( - &mut self, - sql_expr: &SQLExpr, - ) -> ControlFlow { - let (logical_expr, span) = match sql_expr { - SQLExpr::Identifier(ident) => ( - self.sql_to_rel - .sql_identifier_to_expr( - ident.clone(), - self.schema, - self.planner_context, - ) - .ok(), - sql_expr.span(), - ), - SQLExpr::CompoundIdentifier(idents) => ( - self.sql_to_rel - .sql_compound_identifier_to_expr( - idents.clone(), - self.schema, - self.planner_context, - ) - .ok(), - sql_expr.span(), - ), - _ => (None, Span::empty()), - }; - match logical_expr { - Some(Expr::Column(col)) if &col == self.unresolved_field => { - ControlFlow::Break(span) - } - _ => ControlFlow::Continue(()), - } - } - } - let mut visitor = UnresolvedFieldFinder { - unresolved_field: &unresolved_field, - sql_to_rel: self, - schema, - planner_context, - }; - let span = match expr.visit(&mut visitor) { - ControlFlow::Break(span) => Some(span), - ControlFlow::Continue(_) => None, - }; - - if let Some(relation) = &unresolved_field.relation { - Diagnostic::new().with_error( - format!("column '{}' not found in '{}'", &unresolved_field.name, relation.to_string()), - span.unwrap_or(Span::empty()), - ) - } else { - Diagnostic::new().with_error( - format!("column '{}' not found", &unresolved_field.name), - span.unwrap_or(Span::empty()), - ) - } - } - /// Rewrite aliases which are not-complete (e.g. 
ones that only include only table qualifier in a schema.table qualified relation) fn rewrite_partial_qualifier(&self, expr: Expr, schema: &DFSchema) -> Expr { match expr { diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 628bcb2fbdcd..0cba4da7c3e9 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -22,7 +22,8 @@ use std::vec; use arrow_schema::*; use datafusion_common::{ - field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, + field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic, + SchemaError, }; use sqlparser::ast::TimezoneInfo; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; @@ -42,12 +43,13 @@ use crate::utils::make_decimal_type; pub use datafusion_expr::planner::ContextProvider; /// SQL parser options -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ParserOptions { pub parse_float_as_decimal: bool, pub enable_ident_normalization: bool, pub support_varchar_with_length: bool, pub enable_options_value_normalization: bool, + pub collect_spans: bool, } impl Default for ParserOptions { @@ -57,6 +59,7 @@ impl Default for ParserOptions { enable_ident_normalization: true, support_varchar_with_length: true, enable_options_value_normalization: false, + collect_spans: false, } } } @@ -366,7 +369,23 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } .map_err(|_: DataFusionError| { + let diagnostic = if let Some(relation) = &col.relation { + Diagnostic::new().with_error( + format!( + "column '{}' not found in '{}'", + &col.name, + relation.to_string() + ), + col.spans().first_or_empty(), + ) + } else { + Diagnostic::new().with_error( + format!("column '{}' not found", &col.name), + col.spans().first_or_empty(), + ) + }; field_not_found(col.relation.clone(), col.name.as_str(), schema) + .with_diagnostic(diagnostic) }), _ => internal_err!("Not a column"), }) diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 2e02baa4f7f5..368131751e91 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -1096,10 +1096,12 @@ impl Unparser<'_> { Expr::Column(Column { relation: _, name: left_name, + spans: _, }), Expr::Column(Column { relation: _, name: right_name, + spans: _, }), ) if left_name == right_name => { idents.push(self.new_ident_quoted_if_needs(left_name.to_string())); From 3f4ddd2696d83430f5e5110a598abe514c9d2cba Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 13:00:59 +0100 Subject: [PATCH 07/33] feat: adjust field not found error --- datafusion/sql/src/planner.rs | 52 +++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 23 deletions(-) diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 0cba4da7c3e9..83c75b5548b1 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -356,36 +356,42 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .iter() .try_for_each(|col| match col { Expr::Column(col) => match &col.relation { - Some(r) => { - schema.field_with_qualified_name(r, &col.name)?; - Ok(()) - } + Some(r) => schema.field_with_qualified_name(r, &col.name).map(|_| ()), None => { if !schema.fields_with_unqualified_name(&col.name).is_empty() { Ok(()) } else { - Err(unqualified_field_not_found(col.name.as_str(), schema)) + Err(field_not_found( + col.relation.clone(), + col.name.as_str(), + schema, + )) } } } - .map_err(|_: DataFusionError| { - let diagnostic = if let Some(relation) = &col.relation { - 
Diagnostic::new().with_error( - format!( - "column '{}' not found in '{}'", - &col.name, - relation.to_string() - ), - col.spans().first_or_empty(), - ) - } else { - Diagnostic::new().with_error( - format!("column '{}' not found", &col.name), - col.spans().first_or_empty(), - ) - }; - field_not_found(col.relation.clone(), col.name.as_str(), schema) - .with_diagnostic(diagnostic) + .map_err(|err: DataFusionError| match &err { + DataFusionError::SchemaError( + SchemaError::FieldNotFound { .. }, + _, + ) => { + let diagnostic = if let Some(relation) = &col.relation { + Diagnostic::new().with_error( + format!( + "column '{}' not found in '{}'", + &col.name, + relation.to_string() + ), + col.spans().first_or_empty(), + ) + } else { + Diagnostic::new().with_error( + format!("column '{}' not found", &col.name), + col.spans().first_or_empty(), + ) + }; + err.with_diagnostic(diagnostic) + } + _ => err, }), _ => internal_err!("Not a column"), }) From b4df859a5455821563fb3ea0aaa0e179339e6951 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 13:51:51 +0100 Subject: [PATCH 08/33] feat: incompatible types in binary expr --- datafusion/common/src/spans.rs | 4 + datafusion/expr-common/Cargo.toml | 1 + .../expr-common/src/interval_arithmetic.rs | 15 +- .../expr-common/src/type_coercion/binary.rs | 172 ++++++++++++------ datafusion/expr/src/expr.rs | 9 +- datafusion/expr/src/expr_schema.rs | 18 +- .../optimizer/src/analyzer/type_coercion.rs | 13 +- .../physical-expr/src/expressions/binary.rs | 11 +- 8 files changed, 163 insertions(+), 80 deletions(-) diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index bfec00fa673f..0b6e7d00fd5d 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -29,6 +29,10 @@ impl Spans { pub fn add_span(&mut self, span: Span) { self.0.push(span); } + + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } } impl Default for Spans { diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index 1ccc6fc17293..533a12ced00b 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -40,6 +40,7 @@ path = "src/lib.rs" arrow = { workspace = true } datafusion-common = { workspace = true } itertools = { workspace = true } +sqlparser = { workspace = true } [dev-dependencies] paste = "^1.0" diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index ffaa32f08075..2b43ed5848d1 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -18,7 +18,7 @@ //! Interval arithmetic library use crate::operator::Operator; -use crate::type_coercion::binary::get_result_type; +use crate::type_coercion::binary::BinaryTypeCoercer; use std::borrow::Borrow; use std::fmt::{self, Display, Formatter}; use std::ops::{AddAssign, SubAssign}; @@ -518,7 +518,10 @@ impl Interval { /// to an error. pub fn equal>(&self, other: T) -> Result { let rhs = other.borrow(); - if get_result_type(&self.data_type(), &Operator::Eq, &rhs.data_type()).is_err() { + if BinaryTypeCoercer::new(&self.data_type(), &Operator::Eq, &rhs.data_type()) + .get_result_type() + .is_err() + { internal_err!( "Interval data types must be compatible for equality checks, lhs:{}, rhs:{}", self.data_type(), @@ -689,7 +692,9 @@ impl Interval { /// choose single values arbitrarily from each of the operands. 
pub fn add>(&self, other: T) -> Result { let rhs = other.borrow(); - let dt = get_result_type(&self.data_type(), &Operator::Plus, &rhs.data_type())?; + let dt = + BinaryTypeCoercer::new(&self.data_type(), &Operator::Plus, &rhs.data_type()) + .get_result_type()?; Ok(Self::new( add_bounds::(&dt, &self.lower, &rhs.lower), @@ -704,7 +709,9 @@ impl Interval { /// each of the operands. pub fn sub>(&self, other: T) -> Result { let rhs = other.borrow(); - let dt = get_result_type(&self.data_type(), &Operator::Minus, &rhs.data_type())?; + let dt = + BinaryTypeCoercer::new(&self.data_type(), &Operator::Minus, &rhs.data_type()) + .get_result_type()?; Ok(Self::new( sub_bounds::(&dt, &self.lower, &rhs.upper), diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index c775d3131692..3c0a7f15a385 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -30,9 +30,11 @@ use arrow::datatypes::{ }; use datafusion_common::types::NativeType; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, Result, + exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, + Diagnostic, Result, Spans, }; use itertools::Itertools; +use sqlparser::tokenizer::Span; /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` @@ -68,11 +70,45 @@ impl Signature { } } -/// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` -fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result { - use arrow::datatypes::DataType::*; - use Operator::*; - match op { +pub struct BinaryTypeCoercer<'a> { + lhs: &'a DataType, + op: &'a Operator, + rhs: &'a DataType, + + lhs_spans: Spans, + op_spans: Spans, + rhs_spans: Spans, +} + +impl<'a> BinaryTypeCoercer<'a> { + pub fn new(lhs: &'a DataType, op: &'a Operator, rhs: &'a DataType) -> Self { + Self { + lhs, + op, + rhs, + lhs_spans: Spans::new(), + op_spans: Spans::new(), + rhs_spans: Spans::new(), + } + } + + pub fn set_lhs_spans(&mut self, spans: Spans) { + self.lhs_spans = spans; + } + + pub fn set_op_spans(&mut self, spans: Spans) { + self.op_spans = spans; + } + + pub fn set_rhs_spans(&mut self, spans: Spans) { + self.rhs_spans = spans; + } + + /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` + fn signature(&'a self) -> Result { + use arrow::datatypes::DataType::*; + use Operator::*; + match self.op { Eq | NotEq | Lt | @@ -81,46 +117,49 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result GtEq | IsDistinctFrom | IsNotDistinctFrom => { - comparison_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + comparison_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for comparison operation {lhs} {op} {rhs}" + "Cannot infer common argument type for comparison operation {} {} {}", + self.lhs, + self.op, + self.rhs ) }) } - And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) { + And | Or => if matches!((self.lhs, self.rhs), (Boolean | Null, Boolean | Null)) { // Logical binary boolean operators can only be evaluated for // boolean or null arguments. 
Ok(Signature::uniform(Boolean)) } else { plan_err!( - "Cannot infer common argument type for logical boolean operation {lhs} {op} {rhs}" + "Cannot infer common argument type for logical boolean operation {} {} {}", self.lhs, self.op, self.rhs ) } RegexMatch | RegexIMatch | RegexNotMatch | RegexNotIMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs ) }) } LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs ) }) } BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { - bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + bitwise_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common type for bitwise operation {lhs} {op} {rhs}" + "Cannot infer common type for bitwise operation {} {} {}", self.lhs, self.op, self.rhs ) }) } StringConcat => { - string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + string_concat_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common string type for string concat operation {lhs} {op} {rhs}" + "Cannot infer common string type for string concat operation {} {} {}", self.lhs, self.op, self.rhs ) }) } @@ -128,9 +167,9 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result // ArrowAt and AtArrow check for whether one array is contained in another. // The result type is boolean. Signature::comparison defines this signature. // Operation has nothing to do with comparison - array_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + array_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common array type for arrow operation {lhs} {op} {rhs}" + "Cannot infer common array type for arrow operation {} {} {}", self.lhs, self.op, self.rhs ) }) } @@ -140,7 +179,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result let l = new_empty_array(lhs); let r = new_empty_array(rhs); - let result = match op { + let result = match self.op { Plus => add_wrapping(&l, &r), Minus => sub_wrapping(&l, &r), Multiply => mul_wrapping(&l, &r), @@ -151,19 +190,19 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result result.map(|x| x.data_type().clone()) }; - if let Ok(ret) = get_result(lhs, rhs) { + if let Ok(ret) = get_result(self.lhs, self.rhs) { // Temporal arithmetic, e.g. Date32 + Interval Ok(Signature{ - lhs: lhs.clone(), - rhs: rhs.clone(), + lhs: self.lhs.clone(), + rhs: self.rhs.clone(), ret, }) - } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) { + } else if let Some(coerced) = temporal_coercion_strict_timezone(self.lhs, self.rhs) { // Temporal arithmetic by first coercing to a common time representation // e.g. 
Date32 - Timestamp let ret = get_result(&coerced, &coerced).map_err(|e| { plan_datafusion_err!( - "Cannot get result type for temporal operation {coerced} {op} {coerced}: {e}" + "Cannot get result type for temporal operation {coerced} {} {coerced}: {e}", self.op ) })?; Ok(Signature{ @@ -171,11 +210,11 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs: coerced, ret, }) - } else if let Some((lhs, rhs)) = math_decimal_coercion(lhs, rhs) { + } else if let Some((lhs, rhs)) = math_decimal_coercion(self.lhs, self.rhs) { // Decimal arithmetic, e.g. Decimal(10, 2) + Decimal(10, 0) let ret = get_result(&lhs, &rhs).map_err(|e| { plan_datafusion_err!( - "Cannot get result type for decimal operation {lhs} {op} {rhs}: {e}" + "Cannot get result type for decimal operation {} {} {}: {e}", self.lhs, self.op, self.rhs ) })?; Ok(Signature{ @@ -183,36 +222,43 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs, ret, }) - } else if let Some(numeric) = mathematics_numerical_coercion(lhs, rhs) { + } else if let Some(numeric) = mathematics_numerical_coercion(self.lhs, self.rhs) { // Numeric arithmetic, e.g. Int32 + Int32 Ok(Signature::uniform(numeric)) } else { plan_err!( - "Cannot coerce arithmetic expression {lhs} {op} {rhs} to valid types" - ) + "Cannot coerce arithmetic expression {} {} {} to valid types", self.lhs, self.op, self.rhs + ).map_err(|err| { + let binary_expr_span = Span::union_iter( + [ + self.lhs_spans.first_or_empty(), + self.rhs_spans.first_or_empty() + ].iter().copied().filter(|span| span != &Span::empty()) + ); + let diagnostic = Diagnostic::new() + .with_error("expressions have incompatible types", binary_expr_span) + .with_note(format!("has type {}", self.lhs), self.lhs_spans.first_or_empty()) + .with_note(format!("has type {}", self.rhs), self.rhs_spans.first_or_empty()); + err.with_diagnostic(diagnostic) + }) } } } -} + } -/// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types -pub fn get_result_type( - lhs: &DataType, - op: &Operator, - rhs: &DataType, -) -> Result { - signature(lhs, op, rhs).map(|sig| sig.ret) -} + /// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types + pub fn get_result_type(&'a self) -> Result { + self.signature().map(|sig| sig.ret) + } -/// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types -pub fn get_input_types( - lhs: &DataType, - op: &Operator, - rhs: &DataType, -) -> Result<(DataType, DataType)> { - signature(lhs, op, rhs).map(|sig| (sig.lhs, sig.rhs)) + /// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types + pub fn get_input_types(&'a self) -> Result<(DataType, DataType)> { + self.signature().map(|sig| (sig.lhs, sig.rhs)) + } } +// TODO Move the rest inside of BinaryTypeCoercer + /// Coercion rules for mathematics operators between decimal and non-decimal types. 
fn math_decimal_coercion( lhs_type: &DataType, @@ -1477,8 +1523,9 @@ mod tests { #[test] fn test_coercion_error() -> Result<()> { - let result_type = - get_input_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8); + let coercer = + BinaryTypeCoercer::new(&DataType::Float32, &Operator::Plus, &DataType::Utf8); + let result_type = coercer.get_input_types(); let e = result_type.unwrap_err(); assert_eq!(e.strip_backtrace(), "Error during planning: Cannot coerce arithmetic expression Float32 + Utf8 to valid types"); @@ -1521,14 +1568,16 @@ mod tests { for (i, input_type) in input_types.iter().enumerate() { let expect_type = &result_types[i]; for op in comparison_op_types { - let (lhs, rhs) = get_input_types(&input_decimal, &op, input_type)?; + let (lhs, rhs) = BinaryTypeCoercer::new(&input_decimal, &op, input_type) + .get_input_types()?; assert_eq!(expect_type, &lhs); assert_eq!(expect_type, &rhs); } } // negative test let result_type = - get_input_types(&input_decimal, &Operator::Eq, &DataType::Boolean); + BinaryTypeCoercer::new(&input_decimal, &Operator::Eq, &DataType::Boolean) + .get_input_types(); assert!(result_type.is_err()); Ok(()) } @@ -1621,7 +1670,8 @@ mod tests { /// the result type is `$RESULT_TYPE` macro_rules! test_coercion_binary_rule { ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{ - let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?; + let (lhs, rhs) = + BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &$RHS_TYPE).get_input_types()?; assert_eq!(lhs, $RESULT_TYPE); assert_eq!(rhs, $RESULT_TYPE); }}; @@ -1647,17 +1697,20 @@ mod tests { #[test] fn test_date_timestamp_arithmetic_error() -> Result<()> { - let (lhs, rhs) = get_input_types( + let (lhs, rhs) = BinaryTypeCoercer::new( &DataType::Timestamp(TimeUnit::Nanosecond, None), &Operator::Minus, &DataType::Timestamp(TimeUnit::Millisecond, None), - )?; + ) + .get_input_types()?; assert_eq!(lhs.to_string(), "Timestamp(Millisecond, None)"); assert_eq!(rhs.to_string(), "Timestamp(Millisecond, None)"); - let err = get_input_types(&DataType::Date32, &Operator::Plus, &DataType::Date64) - .unwrap_err() - .to_string(); + let err = + BinaryTypeCoercer::new(&DataType::Date32, &Operator::Plus, &DataType::Date64) + .get_input_types() + .unwrap_err() + .to_string(); assert_contains!( &err, @@ -2179,11 +2232,12 @@ mod tests { DataType::Timestamp(TimeUnit::Microsecond, None), true, )); - let result_type = get_input_types( + let result_type = BinaryTypeCoercer::new( &DataType::List(Arc::clone(&inner_field)), &Operator::Eq, &DataType::List(Arc::clone(&inner_timestamp_field)), - ); + ) + .get_input_types(); assert!(result_type.is_err()); // TODO add other data type diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index d0e3d4710105..20b82a5e3754 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -35,7 +35,7 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeContainer, TreeNodeRecursion, }; use datafusion_common::{ - plan_err, Column, DFSchema, HashMap, Result, ScalarValue, TableReference, + plan_err, Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use sqlparser::ast::{ @@ -1663,6 +1663,13 @@ impl Expr { | Expr::Placeholder(..) 
=> false, } } + + pub fn spans(&self) -> Option<&Spans> { + match self { + Expr::Column(col) => Some(&col.spans), + _ => None, + } + } } impl Normalizeable for Expr { diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 25073ca7eaaa..c5ac5f6d3013 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -20,7 +20,6 @@ use crate::expr::{ AggregateFunction, Alias, BinaryExpr, Cast, InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, }; -use crate::type_coercion::binary::get_result_type; use crate::type_coercion::functions::{ data_types_with_aggregate_udf, data_types_with_scalar_udf, data_types_with_window_udf, }; @@ -31,6 +30,7 @@ use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, DataFusionError, ExprSchema, Result, TableReference, }; +use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use std::collections::HashMap; use std::sync::Arc; @@ -217,7 +217,12 @@ impl ExprSchemable for Expr { ref left, ref right, ref op, - }) => get_result_type(&left.get_type(schema)?, op, &right.get_type(schema)?), + }) => BinaryTypeCoercer::new( + &left.get_type(schema)?, + op, + &right.get_type(schema)?, + ) + .get_result_type(), Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean), Expr::Placeholder(Placeholder { data_type, .. }) => { if let Some(dtype) = data_type { @@ -408,9 +413,12 @@ impl ExprSchemable for Expr { ref right, ref op, }) => { - let left = left.data_type_and_nullable(schema)?; - let right = right.data_type_and_nullable(schema)?; - Ok((get_result_type(&left.0, op, &right.0)?, left.1 || right.1)) + let (lhs_type, lhs_nullable) = left.data_type_and_nullable(schema)?; + let (rhs_type, rhs_nullable) = right.data_type_and_nullable(schema)?; + let mut coercer = BinaryTypeCoercer::new(&lhs_type, op, &rhs_type); + coercer.set_lhs_spans(left.spans().cloned().unwrap_or_default()); + coercer.set_rhs_spans(right.spans().cloned().unwrap_or_default()); + Ok((coercer.get_result_type()?, lhs_nullable || rhs_nullable)) } Expr::WindowFunction(window_function) => { self.data_type_and_nullable_with_window_function(schema, window_function) diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 9fc282a8e235..226634edfa76 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -19,6 +19,7 @@ use std::sync::Arc; +use datafusion_expr::binary::BinaryTypeCoercer; use itertools::izip; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema}; @@ -38,9 +39,7 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::Subquery; -use datafusion_expr::type_coercion::binary::{ - comparison_coercion, get_input_types, like_coercion, -}; +use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion}; use datafusion_expr::type_coercion::functions::{ data_types_with_aggregate_udf, data_types_with_scalar_udf, }; @@ -278,11 +277,12 @@ impl<'a> TypeCoercionRewriter<'a> { op: Operator, right: Expr, ) -> Result<(Expr, Expr)> { - let (left_type, right_type) = get_input_types( + let (left_type, right_type) = BinaryTypeCoercer::new( &left.get_type(self.schema)?, &op, &right.get_type(self.schema)?, - )?; + ) + .get_input_types()?; Ok(( 
left.cast_to(&left_type, self.schema)?, right.cast_to(&right_type, self.schema)?, @@ -736,7 +736,8 @@ fn coerce_window_frame( // The above op will be rewrite to the binary op when creating the physical op. fn get_casted_expr_for_bool_op(expr: Expr, schema: &DFSchema) -> Result { let left_type = expr.get_type(schema)?; - get_input_types(&left_type, &Operator::IsDistinctFrom, &DataType::Boolean)?; + BinaryTypeCoercer::new(&left_type, &Operator::IsDistinctFrom, &DataType::Boolean) + .get_input_types()?; expr.cast_to(&DataType::Boolean, schema) } diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 2ab53b214d7f..ef07205068b2 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -33,9 +33,9 @@ use arrow::datatypes::*; use arrow_schema::ArrowError; use datafusion_common::cast::as_boolean_array; use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::binary::BinaryTypeCoercer; use datafusion_expr::interval_arithmetic::{apply_operator, Interval}; use datafusion_expr::sort_properties::ExprProperties; -use datafusion_expr::type_coercion::binary::get_result_type; use datafusion_expr::{ColumnarValue, Operator}; use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested}; @@ -287,11 +287,12 @@ impl PhysicalExpr for BinaryExpr { } fn data_type(&self, input_schema: &Schema) -> Result { - get_result_type( + BinaryTypeCoercer::new( &self.left.data_type(input_schema)?, &self.op, &self.right.data_type(input_schema)?, ) + .get_result_type() } fn nullable(&self, input_schema: &Schema) -> Result { @@ -741,7 +742,6 @@ mod tests { use super::*; use crate::expressions::{col, lit, try_cast, Column, Literal}; use datafusion_common::plan_datafusion_err; - use datafusion_expr::type_coercion::binary::get_input_types; /// Performs a binary operation, applying any type coercion necessary fn binary_op( @@ -752,7 +752,8 @@ mod tests { ) -> Result> { let left_type = left.data_type(schema)?; let right_type = right.data_type(schema)?; - let (lhs, rhs) = get_input_types(&left_type, &op, &right_type)?; + let (lhs, rhs) = + BinaryTypeCoercer::new(&left_type, &op, &right_type).get_input_types()?; let left_expr = try_cast(left, schema, lhs)?; let right_expr = try_cast(right, schema, rhs)?; @@ -856,7 +857,7 @@ mod tests { ]); let a = $A_ARRAY::from($A_VEC); let b = $B_ARRAY::from($B_VEC); - let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?; + let (lhs, rhs) = BinaryTypeCoercer::new(&$A_TYPE, &$OP, &$B_TYPE).get_input_types()?; let left = try_cast(col("a", &schema)?, &schema, lhs)?; let right = try_cast(col("b", &schema)?, &schema, rhs)?; From e4652d70b8ff8a156bbbb076d3b0f9866e4e2efc Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 14:07:52 +0100 Subject: [PATCH 09/33] fix: tests not compiling --- datafusion/core/tests/execution/logical_plan.rs | 3 ++- datafusion/expr/src/logical_plan/builder.rs | 1 + datafusion/sql/src/unparser/expr.rs | 4 +++- datafusion/sql/tests/sql_integration.rs | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs index 168bf484e541..9f093ccf24b6 100644 --- a/datafusion/core/tests/execution/logical_plan.rs +++ b/datafusion/core/tests/execution/logical_plan.rs @@ -18,7 +18,7 @@ use arrow_array::Int64Array; use arrow_schema::{DataType, Field}; use 
datafusion::execution::session_state::SessionStateBuilder; -use datafusion_common::{Column, DFSchema, Result, ScalarValue}; +use datafusion_common::{Column, DFSchema, Result, ScalarValue, Spans}; use datafusion_execution::TaskContext; use datafusion_expr::expr::AggregateFunction; use datafusion_expr::logical_plan::{LogicalPlan, Values}; @@ -51,6 +51,7 @@ async fn count_only_nulls() -> Result<()> { let input_col_ref = Expr::Column(Column { relation: None, name: "col".to_string(), + spans: Spans::new(), }); // Aggregation: count(col) AS count diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index c7cff3ac26b1..323edb2f4442 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2212,6 +2212,7 @@ mod tests { Column { relation: Some(TableReference::Bare { table }), name, + spans: _, }, }, _, diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 7a110fd0785c..9b667935e5eb 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1618,7 +1618,7 @@ mod tests { use arrow::datatypes::{Field, Schema}; use arrow_schema::DataType::Int8; use ast::ObjectName; - use datafusion_common::TableReference; + use datafusion_common::{Spans, TableReference}; use datafusion_expr::expr::WildcardOptions; use datafusion_expr::{ case, cast, col, cube, exists, grouping_set, interval_datetime_lit, @@ -1699,6 +1699,7 @@ mod tests { Expr::Column(Column { relation: Some(TableReference::partial("a", "b")), name: "c".to_string(), + spans: Spans::new(), }) .gt(lit(4)), r#"(b.c > 4)"#, @@ -2030,6 +2031,7 @@ mod tests { expr: Box::new(Expr::Column(Column { relation: Some(TableReference::partial("schema", "table")), name: "array_col".to_string(), + spans: Spans::new(), })), }), r#"UNNEST("table".array_col)"#, diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index f6533dceca4f..360683cd71d5 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -94,6 +94,7 @@ fn parse_decimals() { enable_ident_normalization: false, support_varchar_with_length: false, enable_options_value_normalization: false, + collect_spans: false, }, ); } @@ -149,6 +150,7 @@ fn parse_ident_normalization() { enable_ident_normalization, support_varchar_with_length: false, enable_options_value_normalization: false, + collect_spans: false, }, ); if plan.is_ok() { From d2e70ad40773a1020f8aa9c229a852e55798aeaa Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 14:28:25 +0100 Subject: [PATCH 10/33] feat: ambiguous column --- datafusion/common/src/column.rs | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index f4bb0f8e6461..6a343443c2af 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -21,7 +21,7 @@ use arrow_schema::{Field, FieldRef}; use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; -use crate::{DFSchema, Result, SchemaError, Spans, TableReference}; +use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; @@ -103,7 +103,11 @@ impl Column { // identifiers will be treated as an unqualified column name _ => return None, }; - Some(Self { relation, name, spans: Spans::new() }) + Some(Self { + 
relation, + name, + spans: Spans::new(), + }) } /// Deserialize a fully qualified name string into a column @@ -245,7 +249,14 @@ impl Column { // If not due to USING columns then due to ambiguous column name return _schema_err!(SchemaError::AmbiguousReference { - field: Column::new_unqualified(self.name), + field: Column::new_unqualified(&self.name), + }) + .map_err(|err| { + let diagnostic = Diagnostic::new().with_error( + format!("column '{}' is ambiguous", &self.name), + self.spans().first_or_empty(), + ); + err.with_diagnostic(diagnostic) }); } } From e44cb9f963c1d3af6fe68c1ae50462b13b254833 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 14:48:56 +0100 Subject: [PATCH 11/33] feat: possible references --- datafusion/common/src/column.rs | 19 ++++++++++++++++++- datafusion/common/src/diagnostic.rs | 18 ++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 6a343443c2af..c2dc7256cba4 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -18,6 +18,7 @@ //! Column use arrow_schema::{Field, FieldRef}; +use sqlparser::tokenizer::Span; use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; @@ -252,10 +253,26 @@ impl Column { field: Column::new_unqualified(&self.name), }) .map_err(|err| { - let diagnostic = Diagnostic::new().with_error( + let mut diagnostic = Diagnostic::new().with_error( format!("column '{}' is ambiguous", &self.name), self.spans().first_or_empty(), ); + // TODO If [`DFSchema`] had spans, we could show the + // user which columns are candidates, or which table + // they come from. For now, let's list the table names + // only. + for qualified_field in qualified_fields { + let (Some(table), _) = qualified_field else { + continue; + }; + diagnostic.add_note( + format!( + "possible reference to '{}' in table '{}'", + &self.name, table + ), + Span::empty(), + ); + } err.with_diagnostic(diagnostic) }); } diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 9d4d39fca841..a4653bdb15a9 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -56,11 +56,29 @@ macro_rules! with_kind { }; } +macro_rules! 
add_kind { + ($name:ident, $kind:expr) => { + pub fn $name(&mut self, message: impl Into, span: Span) { + let entry = DiagnosticEntry { + span, + message: message.into(), + kind: $kind, + }; + self.entries.push(entry); + } + }; +} + impl Diagnostic { with_kind!(with_error, DiagnosticEntryKind::Error); with_kind!(with_warning, DiagnosticEntryKind::Warning); with_kind!(with_note, DiagnosticEntryKind::Note); with_kind!(with_help, DiagnosticEntryKind::Help); + + add_kind!(add_error, DiagnosticEntryKind::Error); + add_kind!(add_warning, DiagnosticEntryKind::Warning); + add_kind!(add_note, DiagnosticEntryKind::Note); + add_kind!(add_help, DiagnosticEntryKind::Help); } impl DiagnosticEntry { From 31772ed00d083704e523354cee0fc1cd308e32c6 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 15:21:48 +0100 Subject: [PATCH 12/33] feat: group by --- datafusion/sql/src/select.rs | 9 +++--- datafusion/sql/src/utils.rs | 55 +++++++++++++++++++++++++++++------- 2 files changed, 49 insertions(+), 15 deletions(-) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index b160c9d8748d..05138405deb9 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -21,7 +21,7 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::{ check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, - resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, + resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, CheckColumnsSatisfyExprsPurpose, }; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; @@ -41,8 +41,7 @@ use datafusion_expr::{ use indexmap::IndexMap; use sqlparser::ast::{ - Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, - WildcardAdditionalOptions, WindowType, + Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, Spanned, WildcardAdditionalOptions, WindowType }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; @@ -796,7 +795,7 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, &select_exprs_post_aggr, - "Projection references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate, )?; // Rewrite the HAVING expression to use the columns produced by the @@ -808,7 +807,7 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, std::slice::from_ref(&having_expr_post_aggr), - "HAVING clause references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate, )?; Some(having_expr_post_aggr) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 74ead8cf94ea..b7970102ad31 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,8 +26,8 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; use datafusion_common::{ - exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, HashMap, - Result, ScalarValue, + exec_err, internal_err, plan_err, schema_err, Column, DFSchemaRef, DataFusionError, + Diagnostic, HashMap, Result, ScalarValue, SchemaError, }; use datafusion_expr::builder::get_struct_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; @@ -38,6 +38,7 @@ use datafusion_expr::{ use indexmap::IndexMap; use sqlparser::ast::{Ident, Value}; +use sqlparser::tokenizer::Span; /// 
Make a best-effort attempt at resolving all columns in the expression tree pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { @@ -90,12 +91,35 @@ pub(crate) fn rebase_expr( .data() } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum CheckColumnsSatisfyExprsPurpose { + ProjectionMustReferenceAggregate, + HavingMustReferenceAggregate, +} + +impl CheckColumnsSatisfyExprsPurpose { + fn message_prefix(&self) -> &'static str { + match self { + CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => { + "Projection references non-aggregate values" + } + CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => { + "HAVING clause references non-aggregate values" + } + } + } + + fn diagnostic_message(&self, expr: &Expr) -> String { + format!("'{expr}' must appear in GROUP BY clause because it's not an aggregate expression") + } +} + /// Determines if the set of `Expr`'s are a valid projection on the input /// `Expr::Column`'s. pub(crate) fn check_columns_satisfy_exprs( columns: &[Expr], exprs: &[Expr], - message_prefix: &str, + purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { columns.iter().try_for_each(|c| match c { Expr::Column(_) => Ok(()), @@ -106,22 +130,22 @@ pub(crate) fn check_columns_satisfy_exprs( match e { Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } Expr::GroupingSet(GroupingSet::Cube(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { for exprs in lists_of_exprs { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } } - _ => check_column_satisfies_expr(columns, e, message_prefix)?, + _ => check_column_satisfies_expr(columns, e, purpose)?, } } Ok(()) @@ -130,15 +154,26 @@ pub(crate) fn check_columns_satisfy_exprs( fn check_column_satisfies_expr( columns: &[Expr], expr: &Expr, - message_prefix: &str, + purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { if !columns.contains(expr) { return plan_err!( "{}: Expression {} could not be resolved from available columns: {}", - message_prefix, + purpose.message_prefix(), expr, expr_vec_fmt!(columns) - ); + ) + .map_err(|err| { + let diagnostic = Diagnostic::new() + .with_error( + purpose.diagnostic_message(expr), + expr.spans() + .map(|spans| spans.first_or_empty()) + .unwrap_or(Span::empty()), + ) + .with_help(format!("add '{expr}' to GROUP BY clause"), Span::empty()); + err.with_diagnostic(diagnostic) + }); } Ok(()) } From c15d8f5cfdcbf0469280005d47e0029c143acb68 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 15:22:10 +0100 Subject: [PATCH 13/33] chore: fmt --- datafusion/common/src/dfschema.rs | 4 +--- datafusion/common/src/lib.rs | 8 ++++---- datafusion/common/src/spans.rs | 4 ++-- datafusion/expr/src/expr.rs | 8 ++++++-- datafusion/expr/src/expr_rewriter/mod.rs | 6 +++++- datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs | 2 +- datafusion/optimizer/src/decorrelate.rs | 6 +++++- datafusion/sql/src/select.rs | 6 ++++-- 8 files changed, 28 insertions(+), 16 deletions(-) diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index e3f36fb17af5..2ac629432ce9 100644 --- a/datafusion/common/src/dfschema.rs +++ 
b/datafusion/common/src/dfschema.rs @@ -502,9 +502,7 @@ impl DFSchema { Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1)) } else { _schema_err!(SchemaError::AmbiguousReference { - field: Column::new_unqualified( - name.to_string(), - ), + field: Column::new_unqualified(name.to_string(),), }) } } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 565d6064ee62..b62f22a60160 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -33,6 +33,7 @@ pub mod alias; pub mod cast; pub mod config; pub mod cse; +pub mod diagnostic; pub mod display; pub mod error; pub mod file_options; @@ -42,13 +43,12 @@ pub mod instant; pub mod parsers; pub mod rounding; pub mod scalar; +pub mod spans; pub mod stats; pub mod test_util; pub mod tree_node; pub mod types; pub mod utils; -pub mod diagnostic; -pub mod spans; /// Reexport arrow crate pub use arrow; @@ -56,6 +56,7 @@ pub use column::Column; pub use dfschema::{ qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, }; +pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, @@ -74,12 +75,11 @@ pub use join_type::{JoinConstraint, JoinSide, JoinType}; pub use param_value::ParamValues; pub use scalar::{ScalarType, ScalarValue}; pub use schema_reference::SchemaReference; +pub use spans::Spans; pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; pub use utils::project_schema; -pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; -pub use spans::Spans; // These are hidden from docs purely to avoid polluting the public view of what this crate exports. // These are just re-exports of macros by the same name, which gets around the 'cannot refer to diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 0b6e7d00fd5d..1dd8d9c2c1ea 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -9,8 +9,8 @@ use std::hash::{Hash, Hasher}; /// the column a that comes from SELECT 1 AS a UNION ALL SELECT 2 AS a you'll /// need two spans. #[derive(Debug, Clone)] - // Store teh first [`Span`] on the stack because that is by far the most common - // case. More will spill onto the heap. +// Store teh first [`Span`] on the stack because that is by far the most common +// case. More will spill onto the heap. pub struct Spans(pub SmallVec<[Span; 1]>); impl Spans { diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 20b82a5e3754..57eec81f419c 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -35,7 +35,7 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeContainer, TreeNodeRecursion, }; use datafusion_common::{ - plan_err, Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference + plan_err, Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference, }; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use sqlparser::ast::{ @@ -1111,7 +1111,11 @@ impl Expr { /// output schema. We can use this qualified name to reference the field. 
pub fn qualified_name(&self) -> (Option, String) { match self { - Expr::Column(Column { relation, name, spans: _ }) => (relation.clone(), name.clone()), + Expr::Column(Column { + relation, + name, + spans: _, + }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), } diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 6746fba7870c..d0fafc70539d 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -178,7 +178,11 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { relation: _, name, spans }) => Ok(Column::new( + Expr::Column(Column { + relation: _, + name, + spans, + }) => Ok(Column::new( Some::(subqry_alias.into()), name, )), diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index e887e3c50e6f..7df4e970ada2 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -124,7 +124,7 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { ref relation, ref name, // TODO Should we use these spans? - spans: _ + spans: _, }) => { if name.eq("*") { if let Some(qualifier) = relation { diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index 50c24949d766..36da61f4e8fe 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -533,7 +533,11 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. 
}) => name.to_string(), - Expr::Column(Column { relation: _, name, spans: _ }) => name.to_string(), + Expr::Column(Column { + relation: _, + name, + spans: _, + }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index 05138405deb9..f21500276bb9 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -21,7 +21,8 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::{ check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, - resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, CheckColumnsSatisfyExprsPurpose, + resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, + CheckColumnsSatisfyExprsPurpose, }; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; @@ -41,7 +42,8 @@ use datafusion_expr::{ use indexmap::IndexMap; use sqlparser::ast::{ - Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, Spanned, WildcardAdditionalOptions, WindowType + Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, Spanned, + WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; From 09d7065fa6eb6d87775c3433919ba840087264b1 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 15:24:51 +0100 Subject: [PATCH 14/33] feat: relay `Spans` in `create_col_from_scalar_expr` --- datafusion/common/src/column.rs | 5 +++++ datafusion/expr/src/expr_rewriter/mod.rs | 8 ++++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index c2dc7256cba4..a9f146b1f49e 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -296,6 +296,11 @@ impl Column { pub fn spans_mut(&mut self) -> &mut Spans { &mut self.spans } + + pub fn with_spans(mut self, spans: Spans) -> Self { + self.spans = spans; + self + } } impl From<&str> for Column { diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index d0fafc70539d..77cfaf5295f1 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -182,10 +182,10 @@ pub fn create_col_from_scalar_expr( relation: _, name, spans, - }) => Ok(Column::new( - Some::(subqry_alias.into()), - name, - )), + }) => Ok( + Column::new(Some::(subqry_alias.into()), name) + .with_spans(spans.clone()), + ), _ => { let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( From 1d5c64724817df441bec5fc48c7f7ef9cf456504 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 15:29:25 +0100 Subject: [PATCH 15/33] chore: cargo check --- datafusion/sql/src/expr/identifier.rs | 2 +- datafusion/sql/src/expr/mod.rs | 11 ----------- datafusion/sql/src/planner.rs | 2 +- datafusion/sql/src/relation/mod.rs | 2 +- datafusion/sql/src/select.rs | 2 +- datafusion/sql/src/utils.rs | 4 ++-- 6 files changed, 6 insertions(+), 17 deletions(-) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 3ef6d8a5805a..7e4a9763febb 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -22,7 +22,7 @@ use datafusion_common::{ }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; -use 
sqlparser::ast::{Expr as SQLExpr, Ident, Spanned}; +use sqlparser::ast::{Expr as SQLExpr, Ident}; use sqlparser::tokenizer::Span; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index dac176e5ffc0..bff298a8f8e2 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,21 +15,12 @@ // specific language governing permissions and limitations // under the License. -use std::ops::ControlFlow; use arrow_schema::DataType; use arrow_schema::TimeUnit; -use datafusion_common::Column; -use datafusion_common::DataFusionError; -use datafusion_common::Diagnostic; -use datafusion_common::SchemaError; use datafusion_expr::planner::{ PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, }; -use datafusion_expr::utils::find_column_exprs; -use sqlparser::ast::Spanned; -use sqlparser::ast::Visit; -use sqlparser::ast::Visitor; use sqlparser::ast::{ BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, DictionaryField, Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, @@ -46,9 +37,7 @@ use datafusion_expr::{ lit, Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal, Operator, TryCast, }; -use sqlparser::tokenizer::Span; -use crate::planner::IdentNormalizer; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; mod binary_op; diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 83c75b5548b1..f7ea7e9e9254 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -32,7 +32,7 @@ use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; use datafusion_common::TableReference; use datafusion_common::{ - not_impl_err, plan_err, unqualified_field_not_found, DFSchema, DataFusionError, + not_impl_err, plan_err, DFSchema, DataFusionError, Result, }; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 4d2ee3cc942b..355f380372dc 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -21,7 +21,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ - not_impl_err, plan_err, DFSchema, Diagnostic, DiagnosticEntry, DiagnosticEntryKind, + not_impl_err, plan_err, DFSchema, Diagnostic, Result, TableReference, }; use datafusion_expr::builder::subquery_alias; diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index f21500276bb9..50d89dea6763 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -42,7 +42,7 @@ use datafusion_expr::{ use indexmap::IndexMap; use sqlparser::ast::{ - Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, Spanned, + Distinct, Expr as SQLExpr, GroupByExpr, NamedWindowExpr, OrderByExpr, WildcardAdditionalOptions, WindowType, }; use sqlparser::ast::{NamedWindowDefinition, Select, SelectItem, TableWithJoins}; diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index b7970102ad31..492d9bdbac8f 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,8 +26,8 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; use datafusion_common::{ - exec_err, internal_err, plan_err, schema_err, Column, DFSchemaRef, 
DataFusionError, - Diagnostic, HashMap, Result, ScalarValue, SchemaError, + exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, + Diagnostic, HashMap, Result, ScalarValue, }; use datafusion_expr::builder::get_struct_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; From 5fdc10e431222d3e19842138c2b641ffe4283095 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Thu, 23 Jan 2025 15:30:06 +0100 Subject: [PATCH 16/33] chore: cargo fmt --- datafusion/sql/src/expr/mod.rs | 1 - datafusion/sql/src/planner.rs | 5 +---- datafusion/sql/src/relation/mod.rs | 3 +-- datafusion/sql/src/utils.rs | 4 ++-- 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index bff298a8f8e2..d34ac787c827 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. - use arrow_schema::DataType; use arrow_schema::TimeUnit; use datafusion_expr::planner::{ diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index f7ea7e9e9254..8d8f1d05b4aa 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -31,10 +31,7 @@ use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; use datafusion_common::TableReference; -use datafusion_common::{ - not_impl_err, plan_err, DFSchema, DataFusionError, - Result, -}; +use datafusion_common::{not_impl_err, plan_err, DFSchema, DataFusionError, Result}; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 355f380372dc..11b87369db39 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -21,8 +21,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ - not_impl_err, plan_err, DFSchema, Diagnostic, - Result, TableReference, + not_impl_err, plan_err, DFSchema, Diagnostic, Result, TableReference, }; use datafusion_expr::builder::subquery_alias; use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder}; diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 492d9bdbac8f..1ca86b7e5cde 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,8 +26,8 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; use datafusion_common::{ - exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, - Diagnostic, HashMap, Result, ScalarValue, + exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, Diagnostic, + HashMap, Result, ScalarValue, }; use datafusion_expr::builder::get_struct_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; From 6f6722d4a8c71706a15e420e607d7a1ad176500c Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 09:22:01 +0100 Subject: [PATCH 17/33] feat: diagnostic can only be either error or warning --- datafusion/common/src/column.rs | 2 +- datafusion/common/src/diagnostic.rs | 131 ++++++++---------- datafusion/common/src/error.rs | 2 +- datafusion/common/src/lib.rs | 2 +- 
.../expr-common/src/type_coercion/binary.rs | 3 +- datafusion/sql/src/planner.rs | 4 +- datafusion/sql/src/relation/mod.rs | 2 +- datafusion/sql/src/set_expr.rs | 3 +- datafusion/sql/src/utils.rs | 15 +- 9 files changed, 72 insertions(+), 92 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index a9f146b1f49e..2382ea5d4d0d 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -253,7 +253,7 @@ impl Column { field: Column::new_unqualified(&self.name), }) .map_err(|err| { - let mut diagnostic = Diagnostic::new().with_error( + let mut diagnostic = Diagnostic::new_error( format!("column '{}' is ambiguous", &self.name), self.spans().first_or_empty(), ); diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index a4653bdb15a9..d5c20a535b08 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -1,103 +1,86 @@ use sqlparser::tokenizer::Span; +/// Additional contextual information intended for end users, to help them +/// understand what went wrong by providing human-readable messages, and +/// locations in the source query that relate to the error in some way. +/// +/// You can think of a single [`Diagnostic`] as a single "block" of output from +/// rustc. i.e. either an error or a warning, optionally with some notes and +/// help messages. +/// +/// If the diagnostic, a note, or a help message doesn't need to point to a +/// specific location in the original SQL query (or the [`Span`] is not +/// available), use [`Span::empty`]. +/// +/// Example: +/// +/// ```rust +/// let diagnostic = Diagnostic::new_error("Something went wrong", span) +/// .with_help("Have you tried turning it on and off again?", Span::empty()); +/// ``` #[derive(Debug, Clone)] pub struct Diagnostic { - pub entries: Vec, + pub kind: DiagnosticKind, + pub message: String, + pub span: Span, + pub notes: Vec, + pub helps: Vec, } #[derive(Debug, Clone)] -pub struct DiagnosticEntry { +pub struct DiagnosticNote { + pub message: String, pub span: Span, +} + +#[derive(Debug, Clone)] +pub struct DiagnosticHelp { pub message: String, - pub kind: DiagnosticEntryKind, + pub span: Span, } #[derive(Debug, Clone, Copy, PartialEq, Eq)] -pub enum DiagnosticEntryKind { +pub enum DiagnosticKind { Error, Warning, - Note, - Help, } impl Diagnostic { - pub fn new() -> Self { - Default::default() - } -} - -impl Default for Diagnostic { - fn default() -> Self { - Diagnostic { - entries: Vec::new(), + pub fn new_error(message: impl Into, span: Span) -> Self { + Self { + kind: DiagnosticKind::Error, + message: message.into(), + span, + notes: Vec::new(), + helps: Vec::new(), } } -} -impl FromIterator for Diagnostic { - fn from_iter>(iter: T) -> Self { - Diagnostic { - entries: iter.into_iter().collect(), + pub fn new_warning(message: impl Into, span: Span) -> Self { + Self { + kind: DiagnosticKind::Warning, + message: message.into(), + span, + notes: Vec::new(), + helps: Vec::new(), } } -} - -macro_rules! with_kind { - ($name:ident, $kind:expr) => { - pub fn $name(mut self, message: impl Into, span: Span) -> Self { - let entry = DiagnosticEntry { - span, - message: message.into(), - kind: $kind, - }; - self.entries.push(entry); - self - } - }; -} - -macro_rules! 
add_kind { - ($name:ident, $kind:expr) => { - pub fn $name(&mut self, message: impl Into, span: Span) { - let entry = DiagnosticEntry { - span, - message: message.into(), - kind: $kind, - }; - self.entries.push(entry); - } - }; -} -impl Diagnostic { - with_kind!(with_error, DiagnosticEntryKind::Error); - with_kind!(with_warning, DiagnosticEntryKind::Warning); - with_kind!(with_note, DiagnosticEntryKind::Note); - with_kind!(with_help, DiagnosticEntryKind::Help); + pub fn add_note(&mut self, message: String, span: Span) { + self.notes.push(DiagnosticNote { message, span }); + } - add_kind!(add_error, DiagnosticEntryKind::Error); - add_kind!(add_warning, DiagnosticEntryKind::Warning); - add_kind!(add_note, DiagnosticEntryKind::Note); - add_kind!(add_help, DiagnosticEntryKind::Help); -} + pub fn add_help(&mut self, message: String, span: Span) { + self.helps.push(DiagnosticHelp { message, span }); + } -impl DiagnosticEntry { - pub fn new( - message: impl Into, - kind: DiagnosticEntryKind, - span: Span, - ) -> Self { - DiagnosticEntry { - span, - message: message.into(), - kind, - } + pub fn with_note(mut self, message: String, span: Span) -> Self { + self.add_note(message, span); + self } - pub fn new_without_span( - message: impl Into, - kind: DiagnosticEntryKind, - ) -> Self { - Self::new(message, kind, Span::empty()) + pub fn with_help(mut self, message: String, span: Span) -> Self { + self.add_help(message, span); + self } } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 26479d8d3ada..2e501946a4d1 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -508,7 +508,7 @@ impl DataFusionError { self.with_diagnostic(diagnostic) } - pub fn get_diagnostics(&self) -> impl Iterator + '_ { + pub fn diagnostics(&self) -> impl Iterator + '_ { struct DiagnosticsIterator<'a> { head: &'a DataFusionError, } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index b62f22a60160..e2c059a762fe 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -56,7 +56,7 @@ pub use column::Column; pub use dfschema::{ qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, }; -pub use diagnostic::{Diagnostic, DiagnosticEntry, DiagnosticEntryKind}; +pub use diagnostic::Diagnostic; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 3c0a7f15a385..ff60df6813a8 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -235,8 +235,7 @@ impl<'a> BinaryTypeCoercer<'a> { self.rhs_spans.first_or_empty() ].iter().copied().filter(|span| span != &Span::empty()) ); - let diagnostic = Diagnostic::new() - .with_error("expressions have incompatible types", binary_expr_span) + let diagnostic = Diagnostic::new_error("expressions have incompatible types", binary_expr_span) .with_note(format!("has type {}", self.lhs), self.lhs_spans.first_or_empty()) .with_note(format!("has type {}", self.rhs), self.rhs_spans.first_or_empty()); err.with_diagnostic(diagnostic) diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 8d8f1d05b4aa..d1b7647f4aab 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -372,7 +372,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { _, ) => { let diagnostic = if let Some(relation) = 
&col.relation { - Diagnostic::new().with_error( + Diagnostic::new_error( format!( "column '{}' not found in '{}'", &col.name, @@ -381,7 +381,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { col.spans().first_or_empty(), ) } else { - Diagnostic::new().with_error( + Diagnostic::new_error( format!("column '{}' not found", &col.name), col.spans().first_or_empty(), ) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 11b87369db39..3ed4787eeffa 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -90,7 +90,7 @@ impl SqlToRel<'_, S> { )? .build(), (None, Err(e)) => { - let e = e.with_diagnostic(Diagnostic::new().with_error( + let e = e.with_diagnostic(Diagnostic::new_error( format!("table '{}' not found", table_ref), relation_span, )); diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 4ce84faf7a37..25f530528ab9 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -89,8 +89,7 @@ impl SqlToRel<'_, S> { plan_err!("{} queries have different number of columns", op).map_err(|err| { err.with_diagnostic( - Diagnostic::new() - .with_error( + Diagnostic::new_error( format!("{} queries have different number of columns", op), set_expr_span, ) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 1ca86b7e5cde..4953f0fed91e 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -164,14 +164,13 @@ fn check_column_satisfies_expr( expr_vec_fmt!(columns) ) .map_err(|err| { - let diagnostic = Diagnostic::new() - .with_error( - purpose.diagnostic_message(expr), - expr.spans() - .map(|spans| spans.first_or_empty()) - .unwrap_or(Span::empty()), - ) - .with_help(format!("add '{expr}' to GROUP BY clause"), Span::empty()); + let diagnostic = Diagnostic::new_error( + purpose.diagnostic_message(expr), + expr.spans() + .map(|spans| spans.first_or_empty()) + .unwrap_or(Span::empty()), + ) + .with_help(format!("add '{expr}' to GROUP BY clause"), Span::empty()); err.with_diagnostic(diagnostic) }); } From 9a8ef789dbd2a6079cc43fec5463ed4fc101412e Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 09:40:53 +0100 Subject: [PATCH 18/33] feat: simplify to one `Diagnostic` per error --- datafusion/common/src/error.rs | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 2e501946a4d1..3f23165f6a73 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -508,7 +508,9 @@ impl DataFusionError { self.with_diagnostic(diagnostic) } - pub fn diagnostics(&self) -> impl Iterator + '_ { + /// Gets the [`Diagnostic`] associated with the error, if any. If there is + /// more than one, only the outermost [`Diagnostic`] is returned. 
+ pub fn diagnostic(&self) -> Option<&Diagnostic> { struct DiagnosticsIterator<'a> { head: &'a DataFusionError, } @@ -536,7 +538,7 @@ impl DataFusionError { } } - DiagnosticsIterator { head: self } + DiagnosticsIterator { head: self }.next() } } From fd46dced8c00b2b67c83574e1aff3dc01846dc12 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 10:40:27 +0100 Subject: [PATCH 19/33] test: add tests --- datafusion/common/src/diagnostic.rs | 8 +- datafusion/core/tests/diagnostics.rs | 214 +++++++++++++++++++++++++++ datafusion/sql/src/set_expr.rs | 29 ++-- 3 files changed, 231 insertions(+), 20 deletions(-) create mode 100644 datafusion/core/tests/diagnostics.rs diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index d5c20a535b08..38be97e1b9ed 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -3,17 +3,17 @@ use sqlparser::tokenizer::Span; /// Additional contextual information intended for end users, to help them /// understand what went wrong by providing human-readable messages, and /// locations in the source query that relate to the error in some way. -/// +/// /// You can think of a single [`Diagnostic`] as a single "block" of output from /// rustc. i.e. either an error or a warning, optionally with some notes and /// help messages. -/// +/// /// If the diagnostic, a note, or a help message doesn't need to point to a /// specific location in the original SQL query (or the [`Span`] is not /// available), use [`Span::empty`]. -/// +/// /// Example: -/// +/// /// ```rust /// let diagnostic = Diagnostic::new_error("Something went wrong", span) /// .with_help("Have you tried turning it on and off again?", Span::empty()); diff --git a/datafusion/core/tests/diagnostics.rs b/datafusion/core/tests/diagnostics.rs new file mode 100644 index 000000000000..923c49be964b --- /dev/null +++ b/datafusion/core/tests/diagnostics.rs @@ -0,0 +1,214 @@ +use std::collections::HashMap; + +use arrow_schema::{DataType, Field, Schema}; +use datafusion::error::Result; +use datafusion::prelude::*; +use datafusion_common::Diagnostic; +use regex::Regex; +use sqlparser::tokenizer::{Location, Span}; + +async fn cars_query(sql: &'static str) -> Diagnostic { + let config = + SessionConfig::new().set_bool("datafusion.sql_parser.collect_spans", true); + let ctx = SessionContext::new_with_config(config); + let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string(); + let schema = Schema::new(vec![ + Field::new("car", DataType::Utf8, false), + Field::new("speed", DataType::Float32, false), + Field::new("time", DataType::Utf8, false), + ]); + let read_options = CsvReadOptions::default().has_header(true).schema(&schema); + ctx.register_csv("cars", &csv_path, read_options) + .await + .expect("error registering cars.csv"); + match ctx.sql(sql).await { + Ok(_) => panic!("expected error"), + Err(err) => match err.diagnostic() { + Some(diag) => diag.clone(), + None => panic!("expected diagnostic"), + }, + } +} + +/// Given a query that contains tag delimited spans, returns a mapping from the +/// span name to the [`Span`]. Tags are comments of the form `/*tag*/`. In case +/// you want the same location to open two spans, or close open and open +/// another, you can use '+' to separate the names (the order matters). The +/// names of spans can only contain letters, digits, underscores, and dashes. 
+/// +/// +/// ## Example +/// +/// ```rust +/// let spans = get_spans("SELECT /*myspan*/car/*myspan*/ FROM cars"); +/// // myspan is ^^^ +/// dbg!(&spans["myspan"]); +/// ``` +/// +/// ## Example +/// +/// ```rust +/// let spans = get_spans("SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars"); +/// // whole is ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// // left is ^^^^^ +/// // right is ^^ +/// dbg!(&spans["whole"]); +/// dbg!(&spans["left"]); +/// dbg!(&spans["right"]); +/// ``` +fn get_spans(query: &'static str) -> HashMap { + let mut spans = HashMap::new(); + + let mut bytes_per_line = vec![]; + for line in query.lines() { + bytes_per_line.push(line.len()); + } + + let byte_offset_to_loc = |s: &str, byte_offset: usize| -> Location { + let mut line = 1; + let mut column = 1; + for (i, c) in s.chars().enumerate() { + if i == byte_offset { + return Location { line, column }; + } + if c == '\n' { + line += 1; + column = 1; + } else { + column += 1; + } + } + Location { line, column } + }; + + let re = Regex::new(r#"/\*([\w\d\+_-]+)\*/"#).unwrap(); + let mut stack: Vec<(String, usize)> = vec![]; + for c in re.captures_iter(query) { + let m = c.get(0).unwrap(); + let tags = c.get(1).unwrap().as_str().split("+").collect::>(); + + for tag in tags { + if stack.last().map(|(top_tag, _)| top_tag.as_str()) == Some(tag) { + let (_, start) = stack.pop().unwrap(); + let end = m.start(); + + spans.insert( + tag.to_string(), + Span::new( + byte_offset_to_loc(query, start), + byte_offset_to_loc(query, end), + ), + ); + } else { + stack.push((tag.to_string(), m.end())); + } + } + } + + if !stack.is_empty() { + panic!("unbalanced tags"); + } + + spans +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_table_not_found() -> Result<()> { + let query = "SELECT * FROM /*a*/carz/*a*/"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!(diag.message, "table 'carz' not found"); + assert_eq!(diag.span, spans["a"]); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_unqualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/speedz/*a*/ FROM cars"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!(diag.message, "column 'speedz' not found"); + assert_eq!(diag.span, spans["a"]); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_qualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/cars.speedz/*a*/ FROM cars"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!(diag.message, "column 'speedz' not found in 'cars'"); + assert_eq!(diag.span, spans["a"]); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_union_wrong_number_of_columns() -> Result<()> { + let query = "/*whole+left*/SELECT speed FROM cars/*left*/ UNION ALL /*right*/SELECT speed, time FROM cars/*right+whole*/"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!( + diag.message, + "UNION queries have different number of columns" + ); + assert_eq!(diag.span, spans["whole"]); + assert_eq!(diag.notes[0].message, "this side has 1 fields"); + assert_eq!(diag.notes[0].span, spans["left"]); + assert_eq!(diag.notes[1].message, "this side has 2 fields"); + assert_eq!(diag.notes[1].span, spans["right"]); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_missing_non_aggregate_in_group_by() -> Result<()> { + let query = "SELECT car, /*a*/speed/*a*/ FROM cars GROUP BY car"; + let spans = get_spans(query); + let diag = cars_query(query).await; + 
assert_eq!( + diag.message, + "'cars.speed' must appear in GROUP BY clause because it's not an aggregate expression" + ); + assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.helps[0].message, "add 'cars.speed' to GROUP BY clause"); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_ambiguous_reference() -> Result<()> { + let query = "SELECT /*a*/car/*a*/ FROM cars a, cars b"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!(diag.message, "column 'car' is ambiguous"); + assert_eq!(diag.span, spans["a"]); + assert_eq!( + diag.notes[0].message, + "possible reference to 'car' in table 'a'" + ); + assert_eq!( + diag.notes[1].message, + "possible reference to 'car' in table 'b'" + ); + Ok(()) +} + +#[cfg(test)] +#[tokio::test] +pub async fn test_incompatible_types_binary_arithmetic() -> Result<()> { + let query = + "SELECT /*whole+left*/car/*left*/ + /*right*/speed/*right+whole*/ FROM cars"; + let spans = get_spans(query); + let diag = cars_query(query).await; + assert_eq!(diag.message, "expressions have incompatible types"); + assert_eq!(diag.span, spans["whole"]); + assert_eq!(diag.notes[0].message, "has type Utf8"); + assert_eq!(diag.notes[0].span, spans["left"]); + assert_eq!(diag.notes[1].message, "has type Float32"); + assert_eq!(diag.notes[1].span, spans["right"]); + Ok(()) +} diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 25f530528ab9..2b7f160ee828 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -90,23 +90,20 @@ impl SqlToRel<'_, S> { plan_err!("{} queries have different number of columns", op).map_err(|err| { err.with_diagnostic( Diagnostic::new_error( - format!("{} queries have different number of columns", op), - set_expr_span, - ) - .with_note( - format!( - "this side has {} fields", - left_plan.schema().fields().len() - ), - left.span(), - ) - .with_note( - format!( - "this side has {} fields", - right_plan.schema().fields().len() - ), - right.span(), + format!("{} queries have different number of columns", op), + set_expr_span, + ) + .with_note( + format!("this side has {} fields", left_plan.schema().fields().len()), + left.span(), + ) + .with_note( + format!( + "this side has {} fields", + right_plan.schema().fields().len() ), + right.span(), + ), ) }) } From 360ac202dd00aa10a165a145472c81e0c4d28c9f Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 12:37:31 +0100 Subject: [PATCH 20/33] chore: add license headers --- datafusion/common/src/diagnostic.rs | 17 +++++++++++++++++ datafusion/common/src/spans.rs | 17 +++++++++++++++++ datafusion/core/tests/diagnostics.rs | 17 +++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 38be97e1b9ed..24738d4c5a10 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use sqlparser::tokenizer::Span; /// Additional contextual information intended for end users, to help them diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 1dd8d9c2c1ea..544f26b6b924 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + use smallvec::SmallVec; use sqlparser::tokenizer::Span; use std::cmp::Ordering; diff --git a/datafusion/core/tests/diagnostics.rs b/datafusion/core/tests/diagnostics.rs index 923c49be964b..258390acc7ea 100644 --- a/datafusion/core/tests/diagnostics.rs +++ b/datafusion/core/tests/diagnostics.rs @@ -1,3 +1,20 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ use std::collections::HashMap; use arrow_schema::{DataType, Field, Schema}; From 3dd66fc8875423bbc43e06afae14a78989788bb1 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 12:38:32 +0100 Subject: [PATCH 21/33] chore: update CLI lockfile --- datafusion-cli/Cargo.lock | 58 ++++++++++++++++++++------------------- 1 file changed, 30 insertions(+), 28 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index ae9c82fa1249..15c1d8778d0a 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -863,9 +863,9 @@ dependencies = [ [[package]] name = "brotli-decompressor" -version = "4.0.1" +version = "4.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9a45bd2e4095a8b518033b128020dd4a55aab1c0a381ba4404a472630f4bc362" +checksum = "74fa05ad7d803d413eb8380983b092cbbaf9a85f151b871360e7b00cd7060b37" dependencies = [ "alloc-no-stdlib", "alloc-stdlib", @@ -943,9 +943,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.2.9" +version = "1.2.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8293772165d9345bdaaa39b45b2109591e63fe5e6fbc23c6ff930a048aa310b" +checksum = "13208fcbb66eaeffe09b99fffbe1af420f00a7b35aa99ad683dfc1aa76145229" dependencies = [ "jobserver", "libc", @@ -985,9 +985,9 @@ dependencies = [ [[package]] name = "chrono-tz" -version = "0.10.0" +version = "0.10.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cd6dd8046d00723a59a2f8c5f295c515b9bb9a331ee4f8f3d4dd49e428acd3b6" +checksum = "9c6ac4f2c0bf0f44e9161aec9675e1050aa4a530663c4a9e37e108fa948bca9f" dependencies = [ "chrono", "chrono-tz-build", @@ -1006,9 +1006,9 @@ dependencies = [ [[package]] name = "clap" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a8eb5e908ef3a6efbe1ed62520fb7287959888c88485abe072543190ecc66783" +checksum = "769b0145982b4b48713e01ec42d61614425f27b7058bda7180a3a41f30104796" dependencies = [ "clap_builder", "clap_derive", @@ -1016,9 +1016,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.26" +version = "4.5.27" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "96b01801b5fc6a0a232407abc821660c9c6d25a1cafc0d4f85f29fb8d9afc121" +checksum = "1b26884eb4b57140e4d2d93652abfa49498b938b3c9179f9fc487b0acc3edad7" dependencies = [ "anstream", "anstyle", @@ -1157,9 +1157,9 @@ checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" [[package]] name = "crunchy" -version = "0.2.2" +version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7" +checksum = "43da5946c66ffcc7745f48db692ffbb10a83bfe0afd96235c5c2a4fb23994929" [[package]] name = "crypto-common" @@ -1340,6 +1340,7 @@ dependencies = [ "parquet", "paste", "recursive", + "smallvec", "sqlparser", "tokio", "web-time", @@ -1400,6 +1401,7 @@ dependencies = [ "arrow", "datafusion-common", "itertools 0.14.0", + "sqlparser", ] [[package]] @@ -2398,9 +2400,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown 0.15.2", @@ -2414,9 +2416,9 @@ checksum = 
"8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02" [[package]] name = "ipnet" -version = "2.10.1" +version = "2.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddc24109865250148c2e0f3d25d4f0f479571723792d3802153c60922a4fb708" +checksum = "469fb0b9cefa57e3ef31275ee7cacb78f2fdca44e4765491884a2b119d4eb130" [[package]] name = "is_terminal_polyfill" @@ -2848,9 +2850,9 @@ checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "openssl-probe" -version = "0.1.5" +version = "0.1.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ff011a302c396a5197692431fc1948019154afc178baf7d8e37367442a4601cf" +checksum = "d05e27ee213611ffe7d6348b942e8f942b37114c00cc03cec254295a4a17852e" [[package]] name = "option-ext" @@ -3409,9 +3411,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.43" +version = "0.38.44" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a78891ee6bf2340288408954ac787aa063d8e8817e9f53abb37c695c6d834ef6" +checksum = "fdb5bc1ae2baa591800df16c9ca78619bf65c0488b41b96ccec5d11220d8c154" dependencies = [ "bitflags 2.8.0", "errno", @@ -3624,9 +3626,9 @@ dependencies = [ [[package]] name = "semver" -version = "1.0.24" +version = "1.0.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3cb6eb87a131f756572d7fb904f6e7b68633f09cca868c5df1c4b8d1a694bbba" +checksum = "f79dfe2d285b0488816f30e700a7438c5a73d816b5b7d3ac72fbc48b0d185e03" [[package]] name = "seq-macro" @@ -3665,9 +3667,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.135" +version = "1.0.137" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2b0d7ba2887406110130a978386c4e1befb98c674b4fba677954e4db976630d9" +checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" dependencies = [ "itoa", "memchr", @@ -4201,9 +4203,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-ident" -version = "1.0.14" +version = "1.0.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" +checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" [[package]] name = "unicode-segmentation" @@ -4266,9 +4268,9 @@ checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" [[package]] name = "uuid" -version = "1.12.0" +version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "744018581f9a3454a9e15beb8a33b017183f1e7c0cd170232a2d1453b23a51c4" +checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ "getrandom", "serde", From 13c4c9d8daaaf5faeccf1ac57bf4f62c715b7667 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 12:49:38 +0100 Subject: [PATCH 22/33] chore: taplo format --- datafusion/common/Cargo.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 1e634d5ee8cd..00c2cfecc6e1 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -66,9 +66,9 @@ parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" pyo3 = { version = "0.23.3", optional = true } recursive = { workspace = true, optional = true } +smallvec = "1.13.2" sqlparser = { workspace = true } tokio = { workspace = true } -smallvec = "1.13.2" 
[target.'cfg(target_family = "wasm")'.dependencies] web-time = "1.1.0" From fe74dce9c14c9218085cd83b53a66c50c3102c96 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 12:49:45 +0100 Subject: [PATCH 23/33] fix: doctest --- datafusion/common/src/diagnostic.rs | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 24738d4c5a10..b15b50e704cf 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -32,6 +32,9 @@ use sqlparser::tokenizer::Span; /// Example: /// /// ```rust +/// # use datafusion_common::Diagnostic; +/// # use sqlparser::tokenizer::Span; +/// # let span = Span::empty(); /// let diagnostic = Diagnostic::new_error("Something went wrong", span) /// .with_help("Have you tried turning it on and off again?", Span::empty()); /// ``` @@ -83,21 +86,21 @@ impl Diagnostic { } } - pub fn add_note(&mut self, message: String, span: Span) { - self.notes.push(DiagnosticNote { message, span }); + pub fn add_note(&mut self, message: impl Into, span: Span) { + self.notes.push(DiagnosticNote { message: message.into(), span }); } - pub fn add_help(&mut self, message: String, span: Span) { - self.helps.push(DiagnosticHelp { message, span }); + pub fn add_help(&mut self, message: impl Into, span: Span) { + self.helps.push(DiagnosticHelp { message: message.into(), span }); } - pub fn with_note(mut self, message: String, span: Span) -> Self { - self.add_note(message, span); + pub fn with_note(mut self, message: impl Into, span: Span) -> Self { + self.add_note(message.into(), span); self } - pub fn with_help(mut self, message: String, span: Span) -> Self { - self.add_help(message, span); + pub fn with_help(mut self, message: impl Into, span: Span) -> Self { + self.add_help(message.into(), span); self } } From 829430a6c1bc98e47aa2e140e268f44a27579044 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 12:53:01 +0100 Subject: [PATCH 24/33] chore: configs.md --- docs/source/user-guide/configs.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 1c39064c15d7..e7549d4279b3 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -126,3 +126,4 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. 
| From 2772c6b6fc698221ec3eff6d95123a54a8937424 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 13:03:45 +0100 Subject: [PATCH 25/33] fix: test --- datafusion/core/src/execution/context/mod.rs | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index e377dd6297ce..4d04f6f4af09 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1819,6 +1819,7 @@ mod tests { use crate::test_util::{plan_and_collect, populate_csv_partitions}; use arrow_schema::{DataType, TimeUnit}; use std::env; + use std::error::Error; use std::path::PathBuf; use datafusion_common_runtime::SpawnedTask; @@ -2043,10 +2044,16 @@ mod tests { Err(DataFusionError::Plan(_)) )); - assert!(matches!( - ctx.sql("select * from datafusion.public.test").await, - Err(DataFusionError::Plan(_)) - )); + let err = ctx + .sql("select * from datafusion.public.test") + .await + .unwrap_err(); + let err = err + .source() + .and_then(|err| err.downcast_ref::()) + .unwrap(); + + assert!(matches!(err, &DataFusionError::Plan(_))); Ok(()) } From 1b352078ee3e082389745827f4b5a7cce26000f6 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 13:16:41 +0100 Subject: [PATCH 26/33] fix: clippy, by removing smallvec --- datafusion-cli/Cargo.lock | 1 - datafusion/common/Cargo.toml | 1 - datafusion/common/src/error.rs | 4 ++-- datafusion/common/src/spans.rs | 11 +++++------ datafusion/sql/src/planner.rs | 2 +- 5 files changed, 8 insertions(+), 11 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 15c1d8778d0a..f9fc0c78f820 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -1340,7 +1340,6 @@ dependencies = [ "parquet", "paste", "recursive", - "smallvec", "sqlparser", "tokio", "web-time", diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 00c2cfecc6e1..fe6d652be700 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -66,7 +66,6 @@ parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" pyo3 = { version = "0.23.3", optional = true } recursive = { workspace = true, optional = true } -smallvec = "1.13.2" sqlparser = { workspace = true } tokio = { workspace = true } diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index 3f23165f6a73..0b4aab1dc7e1 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -135,7 +135,7 @@ pub enum DataFusionError { /// for end users, to help them understand what went wrong by providing /// human-readable messages, and locations in the source query that relate /// to the error in some way. - Diagnostic(Diagnostic, Box), + Diagnostic(Box, Box), } #[macro_export] @@ -494,7 +494,7 @@ impl DataFusionError { /// Wraps the error with contextual information intended for end users pub fn with_diagnostic(self, diagnostic: Diagnostic) -> Self { - Self::Diagnostic(diagnostic, Box::new(self)) + Self::Diagnostic(Box::new(diagnostic), Box::new(self)) } /// Wraps the error with contextual information intended for end users. diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 544f26b6b924..e744959e98f4 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
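Since planner errors may now arrive wrapped in `DataFusionError::Diagnostic`, tests (and any other callers) that previously matched on the error variant directly have to peel that wrapper off first, as the updated `ctx.sql(...)` test above does via `source()` and `downcast_ref`. Below is a minimal sketch of that pattern as a reusable helper; the helper name and the commented assertion are illustrative only, not part of the patch:

```rust
use datafusion_common::DataFusionError;

/// Peel off any `Diagnostic` wrappers and return the underlying error.
fn strip_diagnostics(err: &DataFusionError) -> &DataFusionError {
    match err {
        // `Diagnostic` only adds user-facing context; the real cause is inside.
        DataFusionError::Diagnostic(_, inner) => strip_diagnostics(inner),
        other => other,
    }
}

// Usage, e.g. in a test:
// assert!(matches!(strip_diagnostics(&err), DataFusionError::Plan(_)));
```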
-use smallvec::SmallVec; use sqlparser::tokenizer::Span; use std::cmp::Ordering; use std::hash::{Hash, Hasher}; @@ -28,15 +27,15 @@ use std::hash::{Hash, Hasher}; #[derive(Debug, Clone)] // Store teh first [`Span`] on the stack because that is by far the most common // case. More will spill onto the heap. -pub struct Spans(pub SmallVec<[Span; 1]>); +pub struct Spans(pub Vec); impl Spans { pub fn new() -> Self { - Spans(SmallVec::new()) + Spans(Vec::new()) } pub fn first_or_empty(&self) -> Span { - self.0.get(0).copied().unwrap_or(Span::empty()) + self.0.first().copied().unwrap_or(Span::empty()) } pub fn get_spans(&self) -> &[Span] { @@ -76,8 +75,8 @@ impl Eq for Spans {} // interfere with the equality and ordering of the entities themselves, since // this is just diagnostics information for the end user. impl PartialOrd for Spans { - fn partial_cmp(&self, _other: &Self) -> Option { - Some(Ordering::Equal) + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) } } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index d1b7647f4aab..1f7f56ccf3a9 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -376,7 +376,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { format!( "column '{}' not found in '{}'", &col.name, - relation.to_string() + relation ), col.spans().first_or_empty(), ) From 243b7883c8f25e14e43cda77c5f2a556c2efa44a Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 13:47:09 +0100 Subject: [PATCH 27/33] fix: update slt --- .../test_files/create_external_table.slt | 14 +++++++------- datafusion/sqllogictest/test_files/distinct_on.slt | 2 +- .../sqllogictest/test_files/information_schema.slt | 2 ++ datafusion/sqllogictest/test_files/insert.slt | 2 +- .../sqllogictest/test_files/insert_to_external.slt | 2 +- datafusion/sqllogictest/test_files/joins.slt | 8 ++++---- datafusion/sqllogictest/test_files/predicates.slt | 2 +- datafusion/sqllogictest/test_files/select.slt | 4 ++-- .../sqllogictest/test_files/type_coercion.slt | 4 ++-- datafusion/sqllogictest/test_files/unnest.slt | 10 +++++----- 10 files changed, 26 insertions(+), 24 deletions(-) diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 6a63ea1cd3e4..8328f9bbd295 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -33,23 +33,23 @@ statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clau CREATE EXTERNAL TABLE t STORED AS CSV # Option value is missing -statement error DataFusion error: SQL error: ParserError\("Expected: string or numeric value, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: string or numeric value, found: \) at Line: 1, Column: 66"\) CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah' # Missing `(` in WITH ORDER clause -statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: c1"\) +statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: c1 at Line: 1, Column: 58"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv' # Missing `)` in WITH ORDER clause -statement error DataFusion error: SQL error: ParserError\("Expected: \), found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: \), found: LOCATION at Line: 1, Column: 62"\) CREATE EXTERNAL TABLE 
t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv' # Missing `ROW` in WITH HEADER clause -statement error DataFusion error: SQL error: ParserError\("Expected: ROW, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: ROW, found: LOCATION at Line: 1, Column: 51"\) CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' # Missing `BY` in PARTITIONED clause -statement error DataFusion error: SQL error: ParserError\("Expected: BY, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: BY, found: LOCATION at Line: 1, Column: 51"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' # Duplicate `STORED AS` clause @@ -69,11 +69,11 @@ statement error DataFusion error: SQL error: ParserError\("OPTIONS specified mor CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ('k1' 'v1', 'k2' 'v2') OPTIONS ('k3' 'v3') LOCATION 'foo.csv' # With typo error -statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: HEAD"\) +statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: HEAD at Line: 1, Column: 52"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH HEAD ROW LOCATION 'foo.csv'; # Missing `anything` in WITH clause -statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: LOCATION at Line: 1, Column: 52"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH LOCATION 'foo.csv'; # Unrecognized random clause diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index cc0ebf83a843..5aa7c89c15d5 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -153,7 +153,7 @@ b 1 29 -18218 994303988 5983957848665088916 204 9489 3275293996 1485709125918647 c 2 1 18109 2033001162 -6513304855495910254 25 43062 1491205016 5863949479783605708 0.110830784 0.929409733247 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW # can't distinct on * -query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \*"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \* at Line: 1, Column: 21"\) SELECT DISTINCT ON (*) c1 FROM aggregate_test_100 ORDER BY c1 LIMIT 3; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 46618b32d77a..5260d339e5e7 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -257,6 +257,7 @@ datafusion.optimizer.repartition_sorts true datafusion.optimizer.repartition_windows true datafusion.optimizer.skip_failed_rules false datafusion.optimizer.top_down_join_key_reordering true +datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false @@ -351,6 +352,7 @@ datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a datafusion.optimizer.repartition_windows true Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce 
errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys +datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index 804612287246..6b153a285dad 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -431,5 +431,5 @@ NULL 20 500 default_text statement ok drop table test_column_defaults -statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a. +statement error DataFusion error: Schema error: No field named a\. create table test_column_defaults(a int, b int default a+1) diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 2a2aecf2f2cc..ee4dde6248db 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -620,7 +620,7 @@ statement ok drop table test_column_defaults # test invalid default value -statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a. +statement error DataFusion error: Schema error: No field named a\. 
CREATE EXTERNAL TABLE test_column_defaults( a int, b int default a+1 diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 68426f180d99..58b6350eb4df 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -4064,12 +4064,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t1.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int))) 11)----------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t1" }), name: "t1_int", spans: Spans([]) }) # Test CROSS JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int", spans: Spans\(\[\]\) \}\) select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i); @@ -4089,12 +4089,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t2.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int))) 11)----------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t2" }), name: "t1_int", spans: Spans([]) }) # Test INNER JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int", spans: Spans\(\[\]\) \}\) select t1_id, t1_name, i from join_t1 t2 inner join lateral (select * from unnest(generate_series(1, t1_int))) as series(i) on(t1_id > i); # Test RIGHT JOIN LATERAL syntax (unsupported) diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 878d7c8a4dfb..d03d9a36aa8e 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -584,7 +584,7 
@@ DROP TABLE data_index_bloom_encoding_stats; # String coercion ######## -statement error DataFusion error: SQL error: ParserError\("Expected: a data type name, found: ,"\) +statement error DataFusion error: SQL error: ParserError\("Expected: a data type name, found: , at Line: 1, Column: 30"\) CREATE TABLE t(vendor_id_utf8, vendor_id_dict) AS VALUES (arrow_cast('124', 'Utf8'), arrow_cast('124', 'Dictionary(Int16, Utf8)')), diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index dce685f5c137..7eb74a40d52c 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -339,10 +339,10 @@ NULL 1 statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: EOF"\) VALUES -statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 9"\) VALUES () -statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 13"\) VALUES (1),() statement error DataFusion error: Error during planning: Inconsistent data length across values list: got 2 values in row 1 but expected 1 diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index 43e7c2f7bc25..0900c88c15c0 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -103,11 +103,11 @@ CREATE TABLE orders( ); # union_different_num_columns_error() / UNION -query error DataFusion error: Error during planning: UNION queries have different number of columns: left has 1 columns whereas right has 2 columns +query error DataFusion error: Error during planning: UNION queries have different number of columns SELECT order_id FROM orders UNION SELECT customer_id, o_item_id FROM orders # union_different_num_columns_error() / UNION ALL -query error DataFusion error: Error during planning: UNION queries have different number of columns: left has 1 columns whereas right has 2 columns +query error DataFusion error: Error during planning: UNION queries have different number of columns SELECT order_id FROM orders UNION ALL SELECT customer_id, o_item_id FROM orders # union_with_different_column_names() diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 362f0df11a7c..88c363cd278e 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -295,7 +295,7 @@ query error DataFusion error: Error during planning: unnest\(\) requires exactly select unnest(); ## Unnest empty expression in from clause -query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 22"\) select * from unnest(); @@ -863,11 +863,11 @@ select count(*) from (select unnest(range(0, 100000)) id) t inner join (select u # Test implicit LATERAL support for UNNEST # Issue: https://github.com/apache/datafusion/issues/13659 # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression 
OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1", spans: Spans\(\[\]\) \}\) select * from unnest_table u, unnest(u.column1); # Test implicit LATERAL support for UNNEST (INNER JOIN) -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1", spans: Spans\(\[\]\) \}\) select * from unnest_table u INNER JOIN unnest(u.column1) AS t(column1) ON u.column3 = t.column1; # Test implicit LATERAL planning for UNNEST @@ -883,7 +883,7 @@ logical_plan 06)------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 07)--------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 08)----------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1", spans: Spans([]) }) # Test implicit LATERAL planning for UNNEST (INNER JOIN) query TT @@ -899,7 +899,7 @@ logical_plan 07)--------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 08)----------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 09)------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1", spans: Spans([]) }) ## Unnest in subquery From e847477fd6880ec9d130350762f324a529095ac4 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Fri, 24 Jan 2025 15:45:02 +0100 Subject: [PATCH 28/33] feat: support all binary expressions --- datafusion/common/src/diagnostic.rs | 10 ++++- 
.../expr-common/src/type_coercion/binary.rs | 42 ++++++++++++------- datafusion/sql/src/planner.rs | 3 +- 3 files changed, 37 insertions(+), 18 deletions(-) diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index b15b50e704cf..599e36fa443d 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -87,11 +87,17 @@ impl Diagnostic { } pub fn add_note(&mut self, message: impl Into, span: Span) { - self.notes.push(DiagnosticNote { message: message.into(), span }); + self.notes.push(DiagnosticNote { + message: message.into(), + span, + }); } pub fn add_help(&mut self, message: impl Into, span: Span) { - self.helps.push(DiagnosticHelp { message: message.into(), span }); + self.helps.push(DiagnosticHelp { + message: message.into(), + span, + }); } pub fn with_note(mut self, message: impl Into, span: Span) -> Self { diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 88744c32cef9..b77586bd0af6 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -104,11 +104,23 @@ impl<'a> BinaryTypeCoercer<'a> { self.rhs_spans = spans; } + fn span(&self) -> Span { + Span::union_iter( + [ + self.lhs_spans.first_or_empty(), + self.rhs_spans.first_or_empty(), + ] + .iter() + .copied() + .filter(|span| span != &Span::empty()), + ) + } + /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` fn signature(&'a self) -> Result { use arrow::datatypes::DataType::*; use Operator::*; - match self.op { + let result = match self.op { Eq | NotEq | Lt | @@ -228,21 +240,23 @@ impl<'a> BinaryTypeCoercer<'a> { } else { plan_err!( "Cannot coerce arithmetic expression {} {} {} to valid types", self.lhs, self.op, self.rhs - ).map_err(|err| { - let binary_expr_span = Span::union_iter( - [ - self.lhs_spans.first_or_empty(), - self.rhs_spans.first_or_empty() - ].iter().copied().filter(|span| span != &Span::empty()) - ); - let diagnostic = Diagnostic::new_error("expressions have incompatible types", binary_expr_span) - .with_note(format!("has type {}", self.lhs), self.lhs_spans.first_or_empty()) - .with_note(format!("has type {}", self.rhs), self.rhs_spans.first_or_empty()); - err.with_diagnostic(diagnostic) - }) + ) } } - } + }; + result.map_err(|err| { + let diagnostic = + Diagnostic::new_error("expressions have incompatible types", self.span()) + .with_note( + format!("has type {}", self.lhs), + self.lhs_spans.first_or_empty(), + ) + .with_note( + format!("has type {}", self.rhs), + self.rhs_spans.first_or_empty(), + ); + err.with_diagnostic(diagnostic) + }) } /// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 1f7f56ccf3a9..543d58557bb7 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -375,8 +375,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Diagnostic::new_error( format!( "column '{}' not found in '{}'", - &col.name, - relation + &col.name, relation ), col.spans().first_or_empty(), ) From 60a9f8b7ab141dda638207e1031736b09dd45a3a Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Mon, 27 Jan 2025 13:41:39 +0100 Subject: [PATCH 29/33] refactor: move diagnostic tests to datafusion-sql --- .../tests/cases/diagnostic.rs} | 133 +++++++++--------- datafusion/sql/tests/cases/mod.rs | 1 + 2 files changed, 65 insertions(+), 69 
deletions(-) rename datafusion/{core/tests/diagnostics.rs => sql/tests/cases/diagnostic.rs} (60%) diff --git a/datafusion/core/tests/diagnostics.rs b/datafusion/sql/tests/cases/diagnostic.rs similarity index 60% rename from datafusion/core/tests/diagnostics.rs rename to datafusion/sql/tests/cases/diagnostic.rs index 258390acc7ea..9d7f59806aa9 100644 --- a/datafusion/core/tests/diagnostics.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -17,32 +17,34 @@ use std::collections::HashMap; -use arrow_schema::{DataType, Field, Schema}; -use datafusion::error::Result; -use datafusion::prelude::*; -use datafusion_common::Diagnostic; +use datafusion_common::{Diagnostic, Result}; +use datafusion_sql::planner::{ParserOptions, SqlToRel}; use regex::Regex; -use sqlparser::tokenizer::{Location, Span}; - -async fn cars_query(sql: &'static str) -> Diagnostic { - let config = - SessionConfig::new().set_bool("datafusion.sql_parser.collect_spans", true); - let ctx = SessionContext::new_with_config(config); - let csv_path = "../../datafusion/core/tests/data/cars.csv".to_string(); - let schema = Schema::new(vec![ - Field::new("car", DataType::Utf8, false), - Field::new("speed", DataType::Float32, false), - Field::new("time", DataType::Utf8, false), - ]); - let read_options = CsvReadOptions::default().has_header(true).schema(&schema); - ctx.register_csv("cars", &csv_path, read_options) - .await - .expect("error registering cars.csv"); - match ctx.sql(sql).await { +use sqlparser::{dialect::GenericDialect, parser::Parser, tokenizer::{Location, Span}}; + +use crate::{MockContextProvider, MockSessionState}; + +fn do_query(sql: &'static str) -> Diagnostic { + let dialect = GenericDialect {}; + let statement = Parser::new(&dialect) + .try_with_sql(sql) + .expect("unable to create parser") + .parse_statement() + .expect("unable to parse query"); + let options = ParserOptions { + collect_spans: true, + ..ParserOptions::default() + }; + let state = MockSessionState::default(); + let context = MockContextProvider { state }; + let sql_to_rel = SqlToRel::new_with_options(&context, options); + match sql_to_rel.sql_statement_to_plan(statement) { Ok(_) => panic!("expected error"), - Err(err) => match err.diagnostic() { - Some(diag) => diag.clone(), - None => panic!("expected diagnostic"), + Err(err) => { + match err.diagnostic() { + Some(diag) => diag.clone(), + None => panic!("expected diagnostic"), + } }, } } @@ -129,45 +131,41 @@ fn get_spans(query: &'static str) -> HashMap { spans } -#[cfg(test)] -#[tokio::test] -pub async fn test_table_not_found() -> Result<()> { - let query = "SELECT * FROM /*a*/carz/*a*/"; +#[test] +fn test_table_not_found() -> Result<()> { + let query = "SELECT * FROM /*a*/personx/*a*/"; let spans = get_spans(query); - let diag = cars_query(query).await; - assert_eq!(diag.message, "table 'carz' not found"); + let diag = do_query(query); + assert_eq!(diag.message, "table 'personx' not found"); assert_eq!(diag.span, spans["a"]); Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async fn test_unqualified_column_not_found() -> Result<()> { - let query = "SELECT /*a*/speedz/*a*/ FROM cars"; +#[test] +fn test_unqualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/first_namex/*a*/ FROM person"; let spans = get_spans(query); - let diag = cars_query(query).await; - assert_eq!(diag.message, "column 'speedz' not found"); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_namex' not found"); assert_eq!(diag.span, spans["a"]); Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async 
fn test_qualified_column_not_found() -> Result<()> { - let query = "SELECT /*a*/cars.speedz/*a*/ FROM cars"; +#[test] +fn test_qualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/person.first_namex/*a*/ FROM person"; let spans = get_spans(query); - let diag = cars_query(query).await; - assert_eq!(diag.message, "column 'speedz' not found in 'cars'"); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_namex' not found in 'person'"); assert_eq!(diag.span, spans["a"]); Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async fn test_union_wrong_number_of_columns() -> Result<()> { - let query = "/*whole+left*/SELECT speed FROM cars/*left*/ UNION ALL /*right*/SELECT speed, time FROM cars/*right+whole*/"; +#[test] +fn test_union_wrong_number_of_columns() -> Result<()> { + let query = "/*whole+left*/SELECT first_name FROM person/*left*/ UNION ALL /*right*/SELECT first_name, last_name FROM person/*right+whole*/"; let spans = get_spans(query); - let diag = cars_query(query).await; + let diag = do_query(query); assert_eq!( diag.message, "UNION queries have different number of columns" @@ -180,52 +178,49 @@ pub async fn test_union_wrong_number_of_columns() -> Result<()> { Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async fn test_missing_non_aggregate_in_group_by() -> Result<()> { - let query = "SELECT car, /*a*/speed/*a*/ FROM cars GROUP BY car"; +#[test] +fn test_missing_non_aggregate_in_group_by() -> Result<()> { + let query = "SELECT id, /*a*/first_name/*a*/ FROM person GROUP BY id"; let spans = get_spans(query); - let diag = cars_query(query).await; + let diag = do_query(query); assert_eq!( diag.message, - "'cars.speed' must appear in GROUP BY clause because it's not an aggregate expression" + "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" ); assert_eq!(diag.span, spans["a"]); - assert_eq!(diag.helps[0].message, "add 'cars.speed' to GROUP BY clause"); + assert_eq!(diag.helps[0].message, "add 'person.first_name' to GROUP BY clause"); Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async fn test_ambiguous_reference() -> Result<()> { - let query = "SELECT /*a*/car/*a*/ FROM cars a, cars b"; +#[test] +fn test_ambiguous_reference() -> Result<()> { + let query = "SELECT /*a*/first_name/*a*/ FROM person a, person b"; let spans = get_spans(query); - let diag = cars_query(query).await; - assert_eq!(diag.message, "column 'car' is ambiguous"); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_name' is ambiguous"); assert_eq!(diag.span, spans["a"]); assert_eq!( diag.notes[0].message, - "possible reference to 'car' in table 'a'" + "possible reference to 'first_name' in table 'a'" ); assert_eq!( diag.notes[1].message, - "possible reference to 'car' in table 'b'" + "possible reference to 'first_name' in table 'b'" ); Ok(()) } -#[cfg(test)] -#[tokio::test] -pub async fn test_incompatible_types_binary_arithmetic() -> Result<()> { +#[test] +fn test_incompatible_types_binary_arithmetic() -> Result<()> { let query = - "SELECT /*whole+left*/car/*left*/ + /*right*/speed/*right+whole*/ FROM cars"; + "SELECT /*whole+left*/id/*left*/ + /*right*/first_name/*right+whole*/ FROM person"; let spans = get_spans(query); - let diag = cars_query(query).await; + let diag = do_query(query); assert_eq!(diag.message, "expressions have incompatible types"); assert_eq!(diag.span, spans["whole"]); - assert_eq!(diag.notes[0].message, "has type Utf8"); + assert_eq!(diag.notes[0].message, "has type UInt32"); assert_eq!(diag.notes[0].span, 
spans["left"]); - assert_eq!(diag.notes[1].message, "has type Float32"); + assert_eq!(diag.notes[1].message, "has type Utf8"); assert_eq!(diag.notes[1].span, spans["right"]); Ok(()) } diff --git a/datafusion/sql/tests/cases/mod.rs b/datafusion/sql/tests/cases/mod.rs index fc4c59cc88d8..b2469fb19e1e 100644 --- a/datafusion/sql/tests/cases/mod.rs +++ b/datafusion/sql/tests/cases/mod.rs @@ -16,3 +16,4 @@ // under the License. mod plan_to_sql; +mod diagnostic; From f178db022b0f6bffc59289d5461f9c345aab1e40 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Mon, 27 Jan 2025 13:54:04 +0100 Subject: [PATCH 30/33] chore: format --- datafusion/sql/tests/cases/diagnostic.rs | 19 ++++++++++++------- datafusion/sql/tests/cases/mod.rs | 2 +- 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index 9d7f59806aa9..e3fdd033ec22 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -20,7 +20,11 @@ use std::collections::HashMap; use datafusion_common::{Diagnostic, Result}; use datafusion_sql::planner::{ParserOptions, SqlToRel}; use regex::Regex; -use sqlparser::{dialect::GenericDialect, parser::Parser, tokenizer::{Location, Span}}; +use sqlparser::{ + dialect::GenericDialect, + parser::Parser, + tokenizer::{Location, Span}, +}; use crate::{MockContextProvider, MockSessionState}; @@ -40,11 +44,9 @@ fn do_query(sql: &'static str) -> Diagnostic { let sql_to_rel = SqlToRel::new_with_options(&context, options); match sql_to_rel.sql_statement_to_plan(statement) { Ok(_) => panic!("expected error"), - Err(err) => { - match err.diagnostic() { - Some(diag) => diag.clone(), - None => panic!("expected diagnostic"), - } + Err(err) => match err.diagnostic() { + Some(diag) => diag.clone(), + None => panic!("expected diagnostic"), }, } } @@ -188,7 +190,10 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" ); assert_eq!(diag.span, spans["a"]); - assert_eq!(diag.helps[0].message, "add 'person.first_name' to GROUP BY clause"); + assert_eq!( + diag.helps[0].message, + "add 'person.first_name' to GROUP BY clause" + ); Ok(()) } diff --git a/datafusion/sql/tests/cases/mod.rs b/datafusion/sql/tests/cases/mod.rs index b2469fb19e1e..48574984c0ca 100644 --- a/datafusion/sql/tests/cases/mod.rs +++ b/datafusion/sql/tests/cases/mod.rs @@ -15,5 +15,5 @@ // specific language governing permissions and limitations // under the License. 
-mod plan_to_sql; mod diagnostic; +mod plan_to_sql; From 51dd141426aacc8561852eca93d7a0bcdb8f3f06 Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Mon, 27 Jan 2025 15:22:58 +0100 Subject: [PATCH 31/33] fix: tests --- datafusion/expr-common/Cargo.toml | 2 +- datafusion/sql/tests/sql_integration.rs | 14 +++++++++----- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index 86d29b1c2e02..7f2183dc2892 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -40,5 +40,5 @@ path = "src/lib.rs" arrow = { workspace = true } datafusion-common = { workspace = true } itertools = { workspace = true } -sqlparser = { workspace = true } paste = "^1.0" +sqlparser = { workspace = true } diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index 360683cd71d5..b9502c152004 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -3632,10 +3632,10 @@ fn test_prepare_statement_to_plan_panic_prepare_wrong_syntax() { // param is not number following the $ sign // panic due to error returned from the parser let sql = "PREPARE AS SELECT id, age FROM person WHERE age = $foo"; - assert_eq!( - logical_plan(sql).unwrap_err().strip_backtrace(), - "SQL error: ParserError(\"Expected: AS, found: SELECT\")" - ) + assert!(logical_plan(sql) + .unwrap_err() + .strip_backtrace() + .contains("Expected: AS, found: SELECT")) } #[test] @@ -3673,7 +3673,7 @@ fn test_non_prepare_statement_should_infer_types() { #[test] #[should_panic( - expected = "value: SQL(ParserError(\"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: $1\"" + expected = "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: $1" )] fn test_prepare_statement_to_plan_panic_is_param() { let sql = "PREPARE my_plan(INT) AS SELECT id, age FROM person WHERE age is $1"; @@ -4408,6 +4408,10 @@ fn plan_create_index() { } fn assert_field_not_found(err: DataFusionError, name: &str) { + let err = match err { + DataFusionError::Diagnostic(_, err) => *err, + err => err, + }; match err { DataFusionError::SchemaError { .. } => { let msg = format!("{err}"); From b5e326cdf156db470a5e5ec34833812345af430f Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Tue, 28 Jan 2025 13:26:59 +0100 Subject: [PATCH 32/33] feat: pr feedback --- datafusion/common/src/column.rs | 34 ++++- datafusion/common/src/diagnostic.rs | 66 +++++++-- datafusion/common/src/lib.rs | 2 +- datafusion/common/src/spans.rs | 132 +++++++++++++++++- datafusion/expr-common/Cargo.toml | 1 - .../expr-common/src/type_coercion/binary.rs | 36 ++--- datafusion/expr/src/expr.rs | 3 + datafusion/expr/src/expr_rewriter/mod.rs | 9 +- datafusion/sql/src/expr/identifier.rs | 24 +++- datafusion/sql/src/expr/mod.rs | 3 +- datafusion/sql/src/planner.rs | 4 +- datafusion/sql/src/relation/mod.rs | 4 +- datafusion/sql/src/set_expr.rs | 30 ++-- datafusion/sql/src/utils.rs | 7 +- datafusion/sql/tests/cases/diagnostic.rs | 30 ++-- datafusion/sqllogictest/test_files/joins.slt | 8 +- datafusion/sqllogictest/test_files/unnest.slt | 8 +- 17 files changed, 288 insertions(+), 113 deletions(-) diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index 2382ea5d4d0d..9973a72e7bc0 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -17,27 +17,35 @@ //! 
Column -use arrow_schema::{Field, FieldRef}; -use sqlparser::tokenizer::Span; - use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; +use arrow_schema::{Field, FieldRef}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; use std::str::FromStr; /// A named reference to a qualified field in a schema. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { /// relation/table reference. pub relation: Option, /// field/column name. pub name: String, + /// Original source code location, if known pub spans: Spans, } +impl fmt::Debug for Column { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Column") + .field("relation", &self.relation) + .field("name", &self.name) + .finish() + } +} + impl Column { /// Create Column from optional qualifier and name. The optional qualifier, if present, /// will be parsed and normalized by default. @@ -255,7 +263,7 @@ impl Column { .map_err(|err| { let mut diagnostic = Diagnostic::new_error( format!("column '{}' is ambiguous", &self.name), - self.spans().first_or_empty(), + self.spans().first(), ); // TODO If [`DFSchema`] had spans, we could show the // user which columns are candidates, or which table @@ -270,7 +278,7 @@ impl Column { "possible reference to '{}' in table '{}'", &self.name, table ), - Span::empty(), + None, ); } err.with_diagnostic(diagnostic) @@ -289,18 +297,32 @@ impl Column { }) } + /// Returns a reference to the set of locations in the SQL query where this + /// column appears, if known. pub fn spans(&self) -> &Spans { &self.spans } + /// Returns a mutable reference to the set of locations in the SQL query + /// where this column appears, if known. pub fn spans_mut(&mut self) -> &mut Spans { &mut self.spans } + /// Replaces the set of locations in the SQL query where this column + /// appears, if known. pub fn with_spans(mut self, spans: Spans) -> Self { self.spans = spans; self } + + /// Qualifies the column with the given table reference. + pub fn with_relation(&self, relation: TableReference) -> Self { + Self { + relation: Some(relation), + ..self.clone() + } + } } impl From<&str> for Column { diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 599e36fa443d..7b635a574b00 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
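With this change a `Column` carries the locations where it appeared in the query text, which error paths can later hand to a `Diagnostic`. A small, self-contained sketch of that bookkeeping, using made-up coordinates (in practice they come from the SQL tokenizer when `collect_spans` is enabled):

```rust
use datafusion_common::{Column, Location, Span, Spans};

fn main() {
    // Hypothetical location of the identifier `speed` in some query text.
    let span = Span::new(
        Location { line: 1, column: 8 },
        Location { line: 1, column: 13 },
    );

    let mut spans = Spans::new();
    spans.add_span(span);

    // A column reference that remembers where it was written.
    let col = Column::from("speed").with_spans(spans);

    // `first()` is what error paths use when attaching a `Diagnostic`.
    assert_eq!(col.spans().first(), Some(span));
}
```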
-use sqlparser::tokenizer::Span; +use crate::Span; /// Additional contextual information intended for end users, to help them /// understand what went wrong by providing human-readable messages, and @@ -32,33 +32,53 @@ use sqlparser::tokenizer::Span; /// Example: /// /// ```rust -/// # use datafusion_common::Diagnostic; -/// # use sqlparser::tokenizer::Span; -/// # let span = Span::empty(); +/// # use datafusion_common::{Location, Span, Diagnostic}; +/// let span = Some(Span { +/// start: Location{ line: 2, column: 1 }, +/// end: Location{ line: 4, column: 15 } +/// }); /// let diagnostic = Diagnostic::new_error("Something went wrong", span) -/// .with_help("Have you tried turning it on and off again?", Span::empty()); +/// .with_help("Have you tried turning it on and off again?", None); /// ``` #[derive(Debug, Clone)] pub struct Diagnostic { pub kind: DiagnosticKind, pub message: String, - pub span: Span, + pub span: Option, pub notes: Vec, pub helps: Vec, } +/// A note enriches a [`Diagnostic`] with extra information, possibly referring +/// to different locations in the original SQL query, that helps contextualize +/// the error and helps the end user understand why it occurred. +/// +/// Example: +/// SELECT id, name FROM users GROUP BY id +/// Note: ^^^^ 'name' is not in the GROUP BY clause #[derive(Debug, Clone)] pub struct DiagnosticNote { pub message: String, - pub span: Span, + pub span: Option, } +/// A "help" enriches a [`Diagnostic`] with extra information, possibly +/// referring to different locations in the original SQL query, that helps the +/// user understand how they might fix the error or warning. +/// +/// Example: +/// SELECT id, name FROM users GROUP BY id +/// Help: Add 'name' here ^^^^ #[derive(Debug, Clone)] pub struct DiagnosticHelp { pub message: String, - pub span: Span, + pub span: Option, } +/// A [`Diagnostic`] can either be a hard error that prevents the query from +/// being planned and executed, or a warning that indicates potential issues, +/// performance problems, or causes for unexpected results, but is non-fatal. +/// This enum expresses these two possibilities. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum DiagnosticKind { Error, @@ -66,7 +86,11 @@ pub enum DiagnosticKind { } impl Diagnostic { - pub fn new_error(message: impl Into, span: Span) -> Self { + /// Creates a new [`Diagnostic`] for a fatal error that prevents the SQL + /// query from being planned and executed. Optionally takes in a [`Span`] to + /// describe the location in the source code that caused the error, should + /// be provided when available. + pub fn new_error(message: impl Into, span: Option) -> Self { Self { kind: DiagnosticKind::Error, message: message.into(), @@ -76,7 +100,11 @@ impl Diagnostic { } } - pub fn new_warning(message: impl Into, span: Span) -> Self { + /// Creates a new [`Diagnostic`] for a NON-fatal warning, such as a + /// performance problem, or possible cause for undesired results. Optionally + /// takes in a [`Span`] to describe the location in the source code that + /// caused the error, should be provided when available. + pub fn new_warning(message: impl Into, span: Option) -> Self { Self { kind: DiagnosticKind::Warning, message: message.into(), @@ -86,26 +114,36 @@ impl Diagnostic { } } - pub fn add_note(&mut self, message: impl Into, span: Span) { + /// Adds a "note" to the [`Diagnostic`], which can have zero or many. A "note" + /// helps contextualize the error and helps the end user understand why it + /// occurred. 
It can refer to an arbitrary location in the SQL query, or to + /// no location. + pub fn add_note(&mut self, message: impl Into, span: Option) { self.notes.push(DiagnosticNote { message: message.into(), span, }); } - pub fn add_help(&mut self, message: impl Into, span: Span) { + /// Adds a "help" to the [`Diagnostic`], which can have zero or many. A + /// "help" helps the user understand how they might fix the error or + /// warning. It can refer to an arbitrary location in the SQL query, or to + /// no location. + pub fn add_help(&mut self, message: impl Into, span: Option) { self.helps.push(DiagnosticHelp { message: message.into(), span, }); } - pub fn with_note(mut self, message: impl Into, span: Span) -> Self { + /// Like [`Diagnostic::add_note`], but returns `self` to allow chaining. + pub fn with_note(mut self, message: impl Into, span: Option) -> Self { self.add_note(message.into(), span); self } - pub fn with_help(mut self, message: impl Into, span: Span) -> Self { + /// Like [`Diagnostic::add_help`], but returns `self` to allow chaining. + pub fn with_help(mut self, message: impl Into, span: Option) -> Self { self.add_help(message.into(), span); self } diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index e2c059a762fe..1ad2a5f0cec3 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -75,7 +75,7 @@ pub use join_type::{JoinConstraint, JoinSide, JoinType}; pub use param_value::ParamValues; pub use scalar::{ScalarType, ScalarValue}; pub use schema_reference::SchemaReference; -pub use spans::Spans; +pub use spans::{Location, Span, Spans}; pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index e744959e98f4..0d137cb61e77 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -14,12 +14,126 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. - -use sqlparser::tokenizer::Span; -use std::cmp::Ordering; +use std::cmp::{self, Ordering}; +use std::fmt; use std::hash::{Hash, Hasher}; -// / A collection of [`Span`], meant to be used as a field of entities whose +/// Represents a location, determined by a line and a column number, in the +/// original SQL query. +#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)] +pub struct Location { + /// Line number, starting from 1. + /// + /// Note: Line 0 is used for empty spans + pub line: u64, + /// Line column, starting from 1. + /// + /// Note: Column 0 is used for empty spans + pub column: u64, +} + +impl fmt::Debug for Location { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Location({},{})", self.line, self.column) + } +} + +impl From for Location { + fn from(value: sqlparser::tokenizer::Location) -> Self { + Self { + line: value.line, + column: value.column, + } + } +} + +/// Represents an interval of characters in the original SQL query. +#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)] +pub struct Span { + pub start: Location, + pub end: Location, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Span({:?}..{:?})", self.start, self.end) + } +} + +impl Span { + /// Creates a new [`Span`] from a start and an end [`Location`]. 
+ pub fn new(start: Location, end: Location) -> Self { + Self { start, end } + } + + /// Convert a [`Span`](sqlparser::tokenizer::Span) from the parser, into a + /// DataFusion [`Span`]. If the input span is empty (line 0 column 0, to + /// line 0 column 0), then [`None`] is returned. + pub fn try_from_sqlparser_span(span: sqlparser::tokenizer::Span) -> Option { + if span == sqlparser::tokenizer::Span::empty() { + None + } else { + Some(Span { + start: span.start.into(), + end: span.end.into(), + }) + } + } + + /// Returns the smallest Span that contains both `self` and `other` + /// + /// # Examples + /// ``` + /// # use sqlparser::tokenizer::{Span, Location}; + /// // line 1, column1 -> line 2, column 5 + /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5)); + /// // line 2, column 3 -> line 3, column 7 + /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7)); + /// // Union of the two is the min/max of the two spans + /// // line 1, column 1 -> line 3, column 7 + /// let union = span1.union(&span2); + /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7))); + /// ``` + pub fn union(&self, other: &Span) -> Span { + Span { + start: cmp::min(self.start, other.start), + end: cmp::max(self.end, other.end), + } + } + + /// Same as [Span::union] for `Option`. + /// + /// If `other` is `None`, `self` is returned. + pub fn union_opt(&self, other: &Option) -> Span { + match other { + Some(other) => self.union(other), + None => *self, + } + } + + /// Return the [Span::union] of all spans in the iterator. + /// + /// If the iterator is empty, [`None`] is returned. + /// + /// # Example + /// ``` + /// # use sqlparser::tokenizer::{Span, Location}; + /// let spans = vec![ + /// Span::new(Location::new(1, 1), Location::new(2, 5)), + /// Span::new(Location::new(2, 3), Location::new(3, 7)), + /// Span::new(Location::new(3, 1), Location::new(4, 2)), + /// ]; + /// // line 1, column 1 -> line 4, column 2 + /// assert_eq!( + /// Span::union_iter(spans), + /// Span::new(Location::new(1, 1), Location::new(4, 2)) + /// ); + pub fn union_iter>(iter: I) -> Option { + iter.into_iter().reduce(|acc, item| acc.union(&item)) + } +} + +/// A collection of [`Span`], meant to be used as a field of entities whose /// location in the original SQL query is desired to be tracked. Sometimes an /// entity can have multiple spans. e.g. if you want to track the position of /// the column a that comes from SELECT 1 AS a UNION ALL SELECT 2 AS a you'll @@ -30,22 +144,28 @@ use std::hash::{Hash, Hasher}; pub struct Spans(pub Vec); impl Spans { + /// Creates a new empty [`Spans`] with no [`Span`]. pub fn new() -> Self { Spans(Vec::new()) } - pub fn first_or_empty(&self) -> Span { - self.0.first().copied().unwrap_or(Span::empty()) + /// Returns the first [`Span`], if any. This is useful when you know that + /// there's gonna be only one [`Span`] at most. + pub fn first(&self) -> Option { + self.0.first().copied() } + /// Returns a slice of the [`Span`]s. pub fn get_spans(&self) -> &[Span] { &self.0 } + /// Adds a [`Span`] to the collection. pub fn add_span(&mut self, span: Span) { self.0.push(span); } + /// Iterates over the [`Span`]s. 
pub fn iter(&self) -> impl Iterator { self.0.iter() } diff --git a/datafusion/expr-common/Cargo.toml b/datafusion/expr-common/Cargo.toml index 7f2183dc2892..109d8e0b89a6 100644 --- a/datafusion/expr-common/Cargo.toml +++ b/datafusion/expr-common/Cargo.toml @@ -41,4 +41,3 @@ arrow = { workspace = true } datafusion-common = { workspace = true } itertools = { workspace = true } paste = "^1.0" -sqlparser = { workspace = true } diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index b77586bd0af6..7f1c9e321a65 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -31,10 +31,9 @@ use arrow::datatypes::{ use datafusion_common::types::NativeType; use datafusion_common::{ exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, - Diagnostic, Result, Spans, + Diagnostic, Result, Span, Spans, }; use itertools::Itertools; -use sqlparser::tokenizer::Span; /// The type signature of an instantiation of binary operator expression such as /// `lhs + rhs` @@ -70,6 +69,8 @@ impl Signature { } } +/// Provides type information about a binary expression, coercing different +/// input types into a sensible output type. pub struct BinaryTypeCoercer<'a> { lhs: &'a DataType, op: &'a Operator, @@ -81,6 +82,8 @@ pub struct BinaryTypeCoercer<'a> { } impl<'a> BinaryTypeCoercer<'a> { + /// Creates a new [`BinaryTypeCoercer`], for reasoning about the input + /// and output types of a binary expression. pub fn new(lhs: &'a DataType, op: &'a Operator, rhs: &'a DataType) -> Self { Self { lhs, @@ -92,27 +95,30 @@ impl<'a> BinaryTypeCoercer<'a> { } } + /// Sets the spans information for the left side of the binary expression, + /// so better diagnostics can be provided in case of errors. pub fn set_lhs_spans(&mut self, spans: Spans) { self.lhs_spans = spans; } + /// Sets the spans information for the operator of the binary expression, so + /// better diagnostics can be provided in case of errors. pub fn set_op_spans(&mut self, spans: Spans) { self.op_spans = spans; } + /// Sets the spans information for the right side of the binary expression, + /// so better diagnostics can be provided in case of errors. pub fn set_rhs_spans(&mut self, spans: Spans) { self.rhs_spans = spans; } - fn span(&self) -> Span { + fn span(&self) -> Option { Span::union_iter( - [ - self.lhs_spans.first_or_empty(), - self.rhs_spans.first_or_empty(), - ] - .iter() - .copied() - .filter(|span| span != &Span::empty()), + [self.lhs_spans.first(), self.rhs_spans.first()] + .iter() + .copied() + .flatten(), ) } @@ -247,14 +253,8 @@ impl<'a> BinaryTypeCoercer<'a> { result.map_err(|err| { let diagnostic = Diagnostic::new_error("expressions have incompatible types", self.span()) - .with_note( - format!("has type {}", self.lhs), - self.lhs_spans.first_or_empty(), - ) - .with_note( - format!("has type {}", self.rhs), - self.rhs_spans.first_or_empty(), - ); + .with_note(format!("has type {}", self.lhs), self.lhs_spans.first()) + .with_note(format!("has type {}", self.rhs), self.rhs_spans.first()); err.with_diagnostic(diagnostic) }) } diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 57eec81f419c..e2af1a7c99ea 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -1668,6 +1668,9 @@ impl Expr { } } + /// Returns a reference to the set of locations in the SQL query where this + /// expression appears, if known. 
[`None`] is returned if the expression + /// type doesn't support tracking locations yet. pub fn spans(&self) -> Option<&Spans> { match self { Expr::Column(col) => Some(&col.spans), diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 77cfaf5295f1..0b1b10b4a43e 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -178,14 +178,7 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { - relation: _, - name, - spans, - }) => Ok( - Column::new(Some::(subqry_alias.into()), name) - .with_spans(spans.clone()), - ), + Expr::Column(col) => Ok(col.with_relation(subqry_alias.into())), _ => { let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 7e4a9763febb..130cdf083da3 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -18,12 +18,11 @@ use arrow_schema::Field; use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, - DataFusionError, Result, TableReference, + DataFusionError, Result, Span, TableReference, }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; use sqlparser::ast::{Expr as SQLExpr, Ident}; -use sqlparser::tokenizer::Span; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_expr::UNNAMED_TABLE; @@ -62,7 +61,9 @@ impl SqlToRel<'_, S> { normalize_ident, ); if self.options.collect_spans { - column.spans_mut().add_span(id_span); + if let Some(span) = Span::try_from_sqlparser_span(id_span) { + column.spans_mut().add_span(span); + } } return Ok(Expr::Column(column)); } @@ -83,7 +84,9 @@ impl SqlToRel<'_, S> { // Default case let mut column = Column::new_unqualified(normalize_ident); if self.options.collect_spans { - column.spans_mut().add_span(id_span); + if let Some(span) = Span::try_from_sqlparser_span(id_span) { + column.spans_mut().add_span(span); + } } Ok(Expr::Column(column)) } @@ -99,7 +102,10 @@ impl SqlToRel<'_, S> { return internal_err!("Not a compound identifier: {ids:?}"); } - let ids_span = Span::union_iter(ids.iter().map(|id| id.span)); + let ids_span = Span::union_iter( + ids.iter() + .filter_map(|id| Span::try_from_sqlparser_span(id.span)), + ); if ids[0].value.starts_with('@') { let var_names: Vec<_> = ids @@ -146,7 +152,9 @@ impl SqlToRel<'_, S> { Some((field, qualifier, _nested_names)) => { let mut column = Column::from((qualifier, field)); if self.options.collect_spans { - column.spans_mut().add_span(ids_span); + if let Some(span) = ids_span { + column.spans_mut().add_span(span); + } } Ok(Expr::Column(column)) } @@ -193,7 +201,9 @@ impl SqlToRel<'_, S> { let (relation, column_name) = form_identifier(s).unwrap(); let mut column = Column::new(relation, column_name); if self.options.collect_spans { - column.spans_mut().add_span(ids_span); + if let Some(span) = ids_span { + column.spans_mut().add_span(span); + } } Ok(Expr::Column(column)) } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index d34ac787c827..eb62903a0493 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -166,8 +166,7 @@ impl SqlToRel<'_, S> { planner_context: &mut PlannerContext, ) -> Result { // The location of the original SQL expression in the source code - let mut expr = - self.sql_expr_to_logical_expr(sql.clone(), schema, 
planner_context)?; + let mut expr = self.sql_expr_to_logical_expr(sql, schema, planner_context)?; expr = self.rewrite_partial_qualifier(expr, schema); self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr))?; let (expr, _) = expr.infer_placeholder_types(schema)?; diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 543d58557bb7..f22ff6d94fc2 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -377,12 +377,12 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { "column '{}' not found in '{}'", &col.name, relation ), - col.spans().first_or_empty(), + col.spans().first(), ) } else { Diagnostic::new_error( format!("column '{}' not found", &col.name), - col.spans().first_or_empty(), + col.spans().first(), ) }; err.with_diagnostic(diagnostic) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 3ed4787eeffa..800dd151a124 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -21,7 +21,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ - not_impl_err, plan_err, DFSchema, Diagnostic, Result, TableReference, + not_impl_err, plan_err, DFSchema, Diagnostic, Result, Span, TableReference, }; use datafusion_expr::builder::subquery_alias; use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder}; @@ -92,7 +92,7 @@ impl SqlToRel<'_, S> { (None, Err(e)) => { let e = e.with_diagnostic(Diagnostic::new_error( format!("table '{}' not found", table_ref), - relation_span, + Span::try_from_sqlparser_span(relation_span), )); Err(e) } diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 2b7f160ee828..290c0174784a 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -16,12 +16,9 @@ // under the License. 
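The `relation/mod.rs` and `planner.rs` hunks above converge on one pattern: build the usual planning error, then attach a `Diagnostic` whose span is an `Option<Span>`. A rough standalone sketch of that pattern (the `table_not_found_err` helper is invented for illustration and is not part of the patch):

```rust
// Sketch only: `table_not_found_err` is a hypothetical helper, not part of
// this patch. It mirrors the pattern used in the hunks above.
use datafusion_common::{plan_datafusion_err, DataFusionError, Diagnostic, Span};

fn table_not_found_err(
    table_ref: &str,
    relation_span: sqlparser::tokenizer::Span,
) -> DataFusionError {
    // Build the ordinary planning error first...
    plan_datafusion_err!("table '{table_ref}' not found")
        // ...then wrap it with end-user-facing context. The span is optional:
        // `try_from_sqlparser_span` yields `None` for empty parser spans.
        .with_diagnostic(Diagnostic::new_error(
            format!("table '{table_ref}' not found"),
            Span::try_from_sqlparser_span(relation_span),
        ))
}
```

Passing `None` keeps the diagnostic usable even when the parser did not record a location.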
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result}; +use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result, Span}; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::{ - ast::{SetExpr, SetOperator, SetQuantifier, Spanned}, - tokenizer::Span, -}; +use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -30,7 +27,7 @@ impl SqlToRel<'_, S> { set_expr: SetExpr, planner_context: &mut PlannerContext, ) -> Result { - let set_expr_span = set_expr.span(); + let set_expr_span = Span::try_from_sqlparser_span(set_expr.span()); match set_expr { SetExpr::Select(s) => self.select_to_plan(*s, vec![], planner_context), SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context), @@ -40,13 +37,14 @@ impl SqlToRel<'_, S> { right, set_quantifier, } => { - let left_plan = self.set_expr_to_plan(*left.clone(), planner_context)?; - let right_plan = - self.set_expr_to_plan(*right.clone(), planner_context)?; + let left_span = Span::try_from_sqlparser_span(left.span()); + let right_span = Span::try_from_sqlparser_span(right.span()); + let left_plan = self.set_expr_to_plan(*left, planner_context)?; + let right_plan = self.set_expr_to_plan(*right, planner_context)?; self.validate_set_expr_num_of_columns( op, - &left, - &right, + left_span, + right_span, &left_plan, &right_plan, set_expr_span, @@ -77,11 +75,11 @@ impl SqlToRel<'_, S> { fn validate_set_expr_num_of_columns( &self, op: SetOperator, - left: &SetExpr, - right: &SetExpr, + left_span: Option, + right_span: Option, left_plan: &LogicalPlan, right_plan: &LogicalPlan, - set_expr_span: Span, + set_expr_span: Option, ) -> Result<()> { if left_plan.schema().fields().len() == right_plan.schema().fields().len() { return Ok(()); @@ -95,14 +93,14 @@ impl SqlToRel<'_, S> { ) .with_note( format!("this side has {} fields", left_plan.schema().fields().len()), - left.span(), + left_span, ) .with_note( format!( "this side has {} fields", right_plan.schema().fields().len() ), - right.span(), + right_span, ), ) }) diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 4953f0fed91e..ab3e75a960a0 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -38,7 +38,6 @@ use datafusion_expr::{ use indexmap::IndexMap; use sqlparser::ast::{Ident, Value}; -use sqlparser::tokenizer::Span; /// Make a best-effort attempt at resolving all columns in the expression tree pub(crate) fn resolve_columns(expr: &Expr, plan: &LogicalPlan) -> Result { @@ -166,11 +165,9 @@ fn check_column_satisfies_expr( .map_err(|err| { let diagnostic = Diagnostic::new_error( purpose.diagnostic_message(expr), - expr.spans() - .map(|spans| spans.first_or_empty()) - .unwrap_or(Span::empty()), + expr.spans().and_then(|spans| spans.first()), ) - .with_help(format!("add '{expr}' to GROUP BY clause"), Span::empty()); + .with_help(format!("add '{expr}' to GROUP BY clause"), None); err.with_diagnostic(diagnostic) }); } diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs index e3fdd033ec22..55d3a953a728 100644 --- a/datafusion/sql/tests/cases/diagnostic.rs +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -17,14 +17,10 @@ use std::collections::HashMap; -use datafusion_common::{Diagnostic, Result}; +use datafusion_common::{Diagnostic, Location, Result, Span}; use datafusion_sql::planner::{ParserOptions, SqlToRel}; 
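The `set_expr.rs` and `utils.rs` changes above compose the same building blocks. For illustration only (this hypothetical free function is not in the patch), a diagnostic with two notes and a help, all taking `Option<Span>`:

```rust
// Illustrative only: this free function does not exist in the patch; it just
// restates the diagnostic built in `validate_set_expr_num_of_columns`.
use datafusion_common::{Diagnostic, Span};

fn union_column_count_diagnostic(
    whole_span: Option<Span>,
    left_span: Option<Span>,
    right_span: Option<Span>,
    left_fields: usize,
    right_fields: usize,
) -> Diagnostic {
    Diagnostic::new_error("UNION queries have different number of columns", whole_span)
        .with_note(format!("this side has {left_fields} fields"), left_span)
        .with_note(format!("this side has {right_fields} fields"), right_span)
        // A help message may carry no span at all, as in `utils.rs` above.
        .with_help("align the column count on both sides of the set operation", None)
}
```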
use regex::Regex; -use sqlparser::{ - dialect::GenericDialect, - parser::Parser, - tokenizer::{Location, Span}, -}; +use sqlparser::{dialect::GenericDialect, parser::Parser}; use crate::{MockContextProvider, MockSessionState}; @@ -139,7 +135,7 @@ fn test_table_not_found() -> Result<()> { let spans = get_spans(query); let diag = do_query(query); assert_eq!(diag.message, "table 'personx' not found"); - assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.span, Some(spans["a"])); Ok(()) } @@ -149,7 +145,7 @@ fn test_unqualified_column_not_found() -> Result<()> { let spans = get_spans(query); let diag = do_query(query); assert_eq!(diag.message, "column 'first_namex' not found"); - assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.span, Some(spans["a"])); Ok(()) } @@ -159,7 +155,7 @@ fn test_qualified_column_not_found() -> Result<()> { let spans = get_spans(query); let diag = do_query(query); assert_eq!(diag.message, "column 'first_namex' not found in 'person'"); - assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.span, Some(spans["a"])); Ok(()) } @@ -172,11 +168,11 @@ fn test_union_wrong_number_of_columns() -> Result<()> { diag.message, "UNION queries have different number of columns" ); - assert_eq!(diag.span, spans["whole"]); + assert_eq!(diag.span, Some(spans["whole"])); assert_eq!(diag.notes[0].message, "this side has 1 fields"); - assert_eq!(diag.notes[0].span, spans["left"]); + assert_eq!(diag.notes[0].span, Some(spans["left"])); assert_eq!(diag.notes[1].message, "this side has 2 fields"); - assert_eq!(diag.notes[1].span, spans["right"]); + assert_eq!(diag.notes[1].span, Some(spans["right"])); Ok(()) } @@ -189,7 +185,7 @@ fn test_missing_non_aggregate_in_group_by() -> Result<()> { diag.message, "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" ); - assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diag.helps[0].message, "add 'person.first_name' to GROUP BY clause" @@ -203,7 +199,7 @@ fn test_ambiguous_reference() -> Result<()> { let spans = get_spans(query); let diag = do_query(query); assert_eq!(diag.message, "column 'first_name' is ambiguous"); - assert_eq!(diag.span, spans["a"]); + assert_eq!(diag.span, Some(spans["a"])); assert_eq!( diag.notes[0].message, "possible reference to 'first_name' in table 'a'" @@ -222,10 +218,10 @@ fn test_incompatible_types_binary_arithmetic() -> Result<()> { let spans = get_spans(query); let diag = do_query(query); assert_eq!(diag.message, "expressions have incompatible types"); - assert_eq!(diag.span, spans["whole"]); + assert_eq!(diag.span, Some(spans["whole"])); assert_eq!(diag.notes[0].message, "has type UInt32"); - assert_eq!(diag.notes[0].span, spans["left"]); + assert_eq!(diag.notes[0].span, Some(spans["left"])); assert_eq!(diag.notes[1].message, "has type Utf8"); - assert_eq!(diag.notes[1].span, spans["right"]); + assert_eq!(diag.notes[1].span, Some(spans["right"])); Ok(()) } diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index 75bb7fccb213..496c6c609e45 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -4064,12 +4064,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t1.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t1.t1_int))) 
11)----------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t1" }), name: "t1_int", spans: Spans([]) }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t1" }), name: "t1_int" }) # Test CROSS JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int", spans: Spans\(\[\]\) \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t1" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t1 cross join lateral (select * from unnest(generate_series(1, t1_int))) as series(i); @@ -4089,12 +4089,12 @@ logical_plan 09)------------Unnest: lists[__unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int)))|depth=1] structs[] 10)--------------Projection: generate_series(Int64(1), CAST(outer_ref(t2.t1_int) AS Int64)) AS __unnest_placeholder(generate_series(Int64(1),outer_ref(t2.t1_int))) 11)----------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t2" }), name: "t1_int", spans: Spans([]) }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(UInt32, Column { relation: Some(Bare { table: "t2" }), name: "t1_int" }) # Test INNER JOIN LATERAL syntax (execution) # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int", spans: Spans\(\[\]\) \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(UInt32, Column \{ relation: Some\(Bare \{ table: "t2" \}\), name: "t1_int" \}\) select t1_id, t1_name, i from join_t1 t2 inner join lateral (select * from unnest(generate_series(1, t1_int))) as series(i) on(t1_id > i); # Test RIGHT JOIN LATERAL syntax (unsupported) diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 88c363cd278e..a674f7b7f422 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -863,11 +863,11 @@ select count(*) from (select unnest(range(0, 100000)) id) t inner join (select u # Test implicit LATERAL support for UNNEST # Issue: https://github.com/apache/datafusion/issues/13659 # TODO: https://github.com/apache/datafusion/issues/10048 -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1", spans: Spans\(\[\]\) \}\) +query error DataFusion error: This feature is not 
implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u, unnest(u.column1); # Test implicit LATERAL support for UNNEST (INNER JOIN) -query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1", spans: Spans\(\[\]\) \}\) +query error DataFusion error: This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn\(List\(Field \{ name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: \{\} \}\), Column \{ relation: Some\(Bare \{ table: "u" \}\), name: "column1" \}\) select * from unnest_table u INNER JOIN unnest(u.column1) AS t(column1) ON u.column3 = t.column1; # Test implicit LATERAL planning for UNNEST @@ -883,7 +883,7 @@ logical_plan 06)------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 07)--------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 08)----------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1", spans: Spans([]) }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" }) # Test implicit LATERAL planning for UNNEST (INNER JOIN) query TT @@ -899,7 +899,7 @@ logical_plan 07)--------Unnest: lists[__unnest_placeholder(outer_ref(u.column1))|depth=1] structs[] 08)----------Projection: outer_ref(u.column1) AS __unnest_placeholder(outer_ref(u.column1)) 09)------------EmptyRelation -physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1", spans: Spans([]) }) +physical_plan_error This feature is not implemented: Physical plan does not support logical expression OuterReferenceColumn(List(Field { name: "item", data_type: Int64, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }), Column { relation: Some(Bare { table: "u" }), name: "column1" }) ## Unnest in subquery From 1cd0f3e82c3293c51244b60a61ab8f3d3cce74fb Mon Sep 17 00:00:00 2001 From: Elia Perantoni Date: Tue, 28 Jan 2025 21:23:08 +0100 Subject: [PATCH 33/33] fix: ci checks --- datafusion-cli/Cargo.lock | 41 ++++++++++++++--------------- datafusion/common/src/diagnostic.rs | 4 --- datafusion/common/src/spans.rs | 1 + 3 files changed, 21 insertions(+), 25 deletions(-) diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index e61af902891a..5206ddf93d66 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock 
@@ -884,9 +884,9 @@ dependencies = [ [[package]] name = "bumpalo" -version = "3.16.0" +version = "3.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c" +checksum = "1628fb46dfa0b37568d12e5edd512553eccf6a22a78e8bde00bb4aed84d5bdbf" [[package]] name = "byteorder" @@ -1401,7 +1401,6 @@ dependencies = [ "datafusion-common", "itertools 0.14.0", "paste", - "sqlparser", ] [[package]] @@ -2122,9 +2121,9 @@ dependencies = [ [[package]] name = "httparse" -version = "1.9.5" +version = "1.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7d71d3574edd2771538b901e6549113b4006ece66150fb69c0fb6d9a2adae946" +checksum = "f2d708df4e7140240a16cd6ab0ab65c972d7433ab77819ea693fde9c43811e2a" [[package]] name = "httpdate" @@ -2164,9 +2163,9 @@ dependencies = [ [[package]] name = "hyper" -version = "1.5.2" +version = "1.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "256fb8d4bd6413123cc9d91832d78325c48ff41677595be797d90f42969beae0" +checksum = "cc2b571658e38e0c01b1fdca3bbbe93c00d3d71693ff2770043f8c29bc7d6f80" dependencies = [ "bytes", "futures-channel", @@ -2206,7 +2205,7 @@ checksum = "2d191583f3da1305256f22463b9bb0471acad48a4e534a5218b9963e9c1f59b2" dependencies = [ "futures-util", "http 1.2.0", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-util", "rustls 0.23.21", "rustls-native-certs 0.8.1", @@ -2227,7 +2226,7 @@ dependencies = [ "futures-util", "http 1.2.0", "http-body 1.0.1", - "hyper 1.5.2", + "hyper 1.6.0", "pin-project-lite", "socket2", "tokio", @@ -2822,7 +2821,7 @@ dependencies = [ "chrono", "futures", "humantime", - "hyper 1.5.2", + "hyper 1.6.0", "itertools 0.13.0", "md-5", "parking_lot", @@ -3304,7 +3303,7 @@ dependencies = [ "http 1.2.0", "http-body 1.0.1", "http-body-util", - "hyper 1.5.2", + "hyper 1.6.0", "hyper-rustls 0.27.5", "hyper-util", "ipnet", @@ -3491,9 +3490,9 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.10.1" +version = "1.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2bf47e6ff922db3825eb750c4e2ff784c6ff8fb9e13046ef6a1d1c5401b0b37" +checksum = "917ce264624a4b4db1c364dcc35bfca9ded014d0a958cd47ad3e960e988ea51c" dependencies = [ "web-time", ] @@ -3549,9 +3548,9 @@ dependencies = [ [[package]] name = "ryu" -version = "1.0.18" +version = "1.0.19" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f3cb5ba0dc43242ce17de99c180e96db90b235b8a9fdc9543c96d2209116bd9f" +checksum = "6ea1a2d0a644769cc99faa24c3ad26b379b786fe7c36fd3c546254801650e6dd" [[package]] name = "same-file" @@ -3666,9 +3665,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.137" +version = "1.0.138" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "930cfb6e6abf99298aaad7d29abbef7a9999a9a8806a40088f55f0dcec03146b" +checksum = "d434192e7da787e94a6ea7e9670b26a036d0ca41e0b7efb2676dd32bae872949" dependencies = [ "itoa", "memchr", @@ -4202,9 +4201,9 @@ checksum = "42ff0bf0c66b8238c6f3b578df37d0b7848e55df8577b3f74f92a69acceeb825" [[package]] name = "unicode-ident" -version = "1.0.15" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "11cd88e12b17c6494200a9c1b683a04fcac9573ed74cd1b62aeb2727c5592243" +checksum = "a210d160f08b701c8721ba1c726c11662f877ea6b7094007e1ca9a1041945034" [[package]] name = "unicode-segmentation" @@ -4557,9 +4556,9 @@ checksum = 
"589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.24" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c8d71a593cc5c42ad7876e2c1fda56f314f3754c084128833e64f1345ff8a03a" +checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" dependencies = [ "memchr", ] diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs index 7b635a574b00..0dce8e6a56ec 100644 --- a/datafusion/common/src/diagnostic.rs +++ b/datafusion/common/src/diagnostic.rs @@ -25,10 +25,6 @@ use crate::Span; /// rustc. i.e. either an error or a warning, optionally with some notes and /// help messages. /// -/// If the diagnostic, a note, or a help message doesn't need to point to a -/// specific location in the original SQL query (or the [`Span`] is not -/// available), use [`Span::empty`]. -/// /// Example: /// /// ```rust diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs index 0d137cb61e77..40ebdeffb601 100644 --- a/datafusion/common/src/spans.rs +++ b/datafusion/common/src/spans.rs @@ -14,6 +14,7 @@ // KIND, either express or implied. See the License for the // specific language governing permissions and limitations // under the License. + use std::cmp::{self, Ordering}; use std::fmt; use std::hash::{Hash, Hasher};