diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 4a4d0157c620..6982ad86e57e 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -31,7 +31,7 @@ checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" dependencies = [ "cfg-if", "const-random", - "getrandom 0.2.15", + "getrandom", "once_cell", "version_check", "zerocopy", @@ -449,9 +449,9 @@ checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "aws-config" -version = "1.5.15" +version = "1.5.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dc47e70fc35d054c8fcd296d47a61711f043ac80534a10b4f741904f81e73a90" +checksum = "9b49afaa341e8dd8577e1a2200468f98956d6eda50bcf4a53246cc00174ba924" dependencies = [ "aws-credential-types", "aws-runtime", @@ -460,7 +460,7 @@ dependencies = [ "aws-sdk-sts", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.60.7", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -524,7 +524,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.2", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -546,7 +546,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.2", "aws-smithy-runtime", "aws-smithy-runtime-api", "aws-smithy-types", @@ -568,7 +568,7 @@ dependencies = [ "aws-runtime", "aws-smithy-async", "aws-smithy-http", - "aws-smithy-json", + "aws-smithy-json 0.61.2", "aws-smithy-query", "aws-smithy-runtime", "aws-smithy-runtime-api", @@ -635,6 +635,15 @@ dependencies = [ "tracing", ] +[[package]] +name = "aws-smithy-json" +version = "0.60.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4683df9469ef09468dad3473d129960119a0d3593617542b7d52086c8486f2d6" +dependencies = [ + "aws-smithy-types", +] + [[package]] name = 
"aws-smithy-json" version = "0.61.2" @@ -1070,7 +1079,7 @@ version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" dependencies = [ - "getrandom 0.2.15", + "getrandom", "once_cell", "tiny-keccak", ] @@ -1943,22 +1952,10 @@ dependencies = [ "cfg-if", "js-sys", "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "wasm-bindgen", ] -[[package]] -name = "getrandom" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a49c392881ce6d5c3b8cb70f98717b7c07aabbdff06687b9030dbfbe2725f8" -dependencies = [ - "cfg-if", - "libc", - "wasi 0.13.3+wasi-0.2.2", - "windows-targets", -] - [[package]] name = "gimli" version = "0.31.1" @@ -2691,7 +2688,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "2886843bf800fba2e3377cff24abf6379b4c4d5c6681eaf9ea5b0d15090450bd" dependencies = [ "libc", - "wasi 0.11.0+wasi-snapshot-preview1", + "wasi", "windows-sys 0.52.0", ] @@ -3135,7 +3132,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a2fe5ef3495d7d2e377ff17b1a8ce2ee2ec2a18cde8b6ad6619d65d0701c135d" dependencies = [ "bytes", - "getrandom 0.2.15", + "getrandom", "rand", "ring", "rustc-hash", @@ -3208,7 +3205,7 @@ version = "0.6.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" dependencies = [ - "getrandom 0.2.15", + "getrandom", ] [[package]] @@ -3246,7 +3243,7 @@ version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dd6f9d3d47bdd2ad6945c5015a226ec6155d0bcdfd8f7cd29f86b71f8de99d2b" dependencies = [ - "getrandom 0.2.15", + "getrandom", "libredox", "thiserror 2.0.11", ] @@ -3346,7 +3343,7 @@ checksum = "c17fa4cb658e3583423e915b9f3acc01cceaee1860e33d59ebae66adc3a2dc0d" dependencies = [ "cc", "cfg-if", - "getrandom 0.2.15", + "getrandom", 
"libc", "spin", "untrusted", @@ -3890,13 +3887,13 @@ dependencies = [ [[package]] name = "tempfile" -version = "3.16.0" +version = "3.15.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "38c246215d7d24f48ae091a2902398798e05d978b24315d6efbc00ede9a8bb91" +checksum = "9a8a559c81686f576e8cd0290cd2a24a2a9ad80c98b3478856500fcbd7acd704" dependencies = [ "cfg-if", "fastrand", - "getrandom 0.3.1", + "getrandom", "once_cell", "rustix", "windows-sys 0.59.0", @@ -4267,7 +4264,7 @@ version = "1.12.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b3758f5e68192bb96cc8f9b7e2c2cfdabb435499a28499a42f8f984092adad4b" dependencies = [ - "getrandom 0.2.15", + "getrandom", "serde", ] @@ -4317,15 +4314,6 @@ version = "0.11.0+wasi-snapshot-preview1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" -[[package]] -name = "wasi" -version = "0.13.3+wasi-0.2.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "26816d2e1a4a36a2940b96c5296ce403917633dff8f3440e9b236ed6f6bacad2" -dependencies = [ - "wit-bindgen-rt", -] - [[package]] name = "wasm-bindgen" version = "0.2.100" @@ -4562,22 +4550,13 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "winnow" -version = "0.6.26" +version = "0.6.25" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e90edd2ac1aa278a5c4599b1d89cf03074b610800f866d4026dc199d7929a28" +checksum = "ad699df48212c6cc6eb4435f35500ac6fd3b9913324f938aea302022ce19d310" dependencies = [ "memchr", ] -[[package]] -name = "wit-bindgen-rt" -version = "0.33.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3268f3d866458b787f390cf61f4bbb563b922d091359f9608842999eaee3943c" -dependencies = [ - "bitflags 2.8.0", -] - [[package]] name = "write16" version = "1.0.0" diff --git 
a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs index fdde3d69eddb..9973a72e7bc0 100644 --- a/datafusion/common/src/column.rs +++ b/datafusion/common/src/column.rs @@ -17,23 +17,33 @@ //! Column -use arrow_schema::{Field, FieldRef}; - use crate::error::_schema_err; use crate::utils::{parse_identifiers_normalized, quote_identifier}; -use crate::{DFSchema, Result, SchemaError, TableReference}; +use crate::{DFSchema, Diagnostic, Result, SchemaError, Spans, TableReference}; +use arrow_schema::{Field, FieldRef}; use std::collections::HashSet; use std::convert::Infallible; use std::fmt; use std::str::FromStr; /// A named reference to a qualified field in a schema. -#[derive(Debug, Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] +#[derive(Clone, PartialEq, Eq, Hash, PartialOrd, Ord)] pub struct Column { /// relation/table reference. pub relation: Option, /// field/column name. pub name: String, + /// Original source code location, if known + pub spans: Spans, +} + +impl fmt::Debug for Column { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + f.debug_struct("Column") + .field("relation", &self.relation) + .field("name", &self.name) + .finish() + } } impl Column { @@ -50,6 +60,7 @@ impl Column { Self { relation: relation.map(|r| r.into()), name: name.into(), + spans: Spans::new(), } } @@ -58,6 +69,7 @@ impl Column { Self { relation: None, name: name.into(), + spans: Spans::new(), } } @@ -68,6 +80,7 @@ impl Column { Self { relation: None, name: name.into(), + spans: Spans::new(), } } @@ -99,7 +112,11 @@ impl Column { // identifiers will be treated as an unqualified column name _ => return None, }; - Some(Self { relation, name }) + Some(Self { + relation, + name, + spans: Spans::new(), + }) } /// Deserialize a fully qualified name string into a column @@ -113,6 +130,7 @@ impl Column { Self { relation: None, name: flat_name, + spans: Spans::new(), }, ) } @@ -124,6 +142,7 @@ impl Column { Self { relation: None, name: flat_name, + spans: 
Spans::new(), }, ) } @@ -239,7 +258,30 @@ impl Column { // If not due to USING columns then due to ambiguous column name return _schema_err!(SchemaError::AmbiguousReference { - field: Column::new_unqualified(self.name), + field: Column::new_unqualified(&self.name), + }) + .map_err(|err| { + let mut diagnostic = Diagnostic::new_error( + format!("column '{}' is ambiguous", &self.name), + self.spans().first(), + ); + // TODO If [`DFSchema`] had spans, we could show the + // user which columns are candidates, or which table + // they come from. For now, let's list the table names + // only. + for qualified_field in qualified_fields { + let (Some(table), _) = qualified_field else { + continue; + }; + diagnostic.add_note( + format!( + "possible reference to '{}' in table '{}'", + &self.name, table + ), + None, + ); + } + err.with_diagnostic(diagnostic) }); } } @@ -254,6 +296,33 @@ impl Column { .collect(), }) } + + /// Returns a reference to the set of locations in the SQL query where this + /// column appears, if known. + pub fn spans(&self) -> &Spans { + &self.spans + } + + /// Returns a mutable reference to the set of locations in the SQL query + /// where this column appears, if known. + pub fn spans_mut(&mut self) -> &mut Spans { + &mut self.spans + } + + /// Replaces the set of locations in the SQL query where this column + /// appears, if known. + pub fn with_spans(mut self, spans: Spans) -> Self { + self.spans = spans; + self + } + + /// Qualifies the column with the given table reference. + pub fn with_relation(&self, relation: TableReference) -> Self { + Self { + relation: Some(relation), + ..self.clone() + } + } } impl From<&str> for Column { diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 32b7213d952f..320417c35a79 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -250,6 +250,11 @@ config_namespace! { /// specified. 
The Arrow type system does not have a notion of maximum /// string length and thus DataFusion can not enforce such limits. pub support_varchar_with_length: bool, default = true + + /// When set to true, the source locations relative to the original SQL + /// query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected + /// and recorded in the logical plan nodes. + pub collect_spans: bool, default = false } } diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 302d515e027e..2ac629432ce9 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -502,10 +502,7 @@ impl DFSchema { Ok((fields_without_qualifier[0].0, fields_without_qualifier[0].1)) } else { _schema_err!(SchemaError::AmbiguousReference { - field: Column { - relation: None, - name: name.to_string(), - }, + field: Column::new_unqualified(name.to_string(),), }) } } diff --git a/datafusion/common/src/diagnostic.rs b/datafusion/common/src/diagnostic.rs new file mode 100644 index 000000000000..0dce8e6a56ec --- /dev/null +++ b/datafusion/common/src/diagnostic.rs @@ -0,0 +1,146 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +use crate::Span; + +/// Additional contextual information intended for end users, to help them +/// understand what went wrong by providing human-readable messages, and +/// locations in the source query that relate to the error in some way. +/// +/// You can think of a single [`Diagnostic`] as a single "block" of output from +/// rustc. i.e. either an error or a warning, optionally with some notes and +/// help messages. +/// +/// Example: +/// +/// ```rust +/// # use datafusion_common::{Location, Span, Diagnostic}; +/// let span = Some(Span { +/// start: Location{ line: 2, column: 1 }, +/// end: Location{ line: 4, column: 15 } +/// }); +/// let diagnostic = Diagnostic::new_error("Something went wrong", span) +/// .with_help("Have you tried turning it on and off again?", None); +/// ``` +#[derive(Debug, Clone)] +pub struct Diagnostic { + pub kind: DiagnosticKind, + pub message: String, + pub span: Option, + pub notes: Vec, + pub helps: Vec, +} + +/// A note enriches a [`Diagnostic`] with extra information, possibly referring +/// to different locations in the original SQL query, that helps contextualize +/// the error and helps the end user understand why it occurred. +/// +/// Example: +/// SELECT id, name FROM users GROUP BY id +/// Note: ^^^^ 'name' is not in the GROUP BY clause +#[derive(Debug, Clone)] +pub struct DiagnosticNote { + pub message: String, + pub span: Option, +} + +/// A "help" enriches a [`Diagnostic`] with extra information, possibly +/// referring to different locations in the original SQL query, that helps the +/// user understand how they might fix the error or warning. 
+/// +/// Example: +/// SELECT id, name FROM users GROUP BY id +/// Help: Add 'name' here ^^^^ +#[derive(Debug, Clone)] +pub struct DiagnosticHelp { + pub message: String, + pub span: Option, +} + +/// A [`Diagnostic`] can either be a hard error that prevents the query from +/// being planned and executed, or a warning that indicates potential issues, +/// performance problems, or causes for unexpected results, but is non-fatal. +/// This enum expresses these two possibilities. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum DiagnosticKind { + Error, + Warning, +} + +impl Diagnostic { + /// Creates a new [`Diagnostic`] for a fatal error that prevents the SQL + /// query from being planned and executed. Optionally takes in a [`Span`] to + /// describe the location in the source code that caused the error, should + /// be provided when available. + pub fn new_error(message: impl Into, span: Option) -> Self { + Self { + kind: DiagnosticKind::Error, + message: message.into(), + span, + notes: Vec::new(), + helps: Vec::new(), + } + } + + /// Creates a new [`Diagnostic`] for a NON-fatal warning, such as a + /// performance problem, or possible cause for undesired results. Optionally + /// takes in a [`Span`] to describe the location in the source code that + /// caused the error, should be provided when available. + pub fn new_warning(message: impl Into, span: Option) -> Self { + Self { + kind: DiagnosticKind::Warning, + message: message.into(), + span, + notes: Vec::new(), + helps: Vec::new(), + } + } + + /// Adds a "note" to the [`Diagnostic`], which can have zero or many. A "note" + /// helps contextualize the error and helps the end user understand why it + /// occurred. It can refer to an arbitrary location in the SQL query, or to + /// no location. 
+ pub fn add_note(&mut self, message: impl Into, span: Option) { + self.notes.push(DiagnosticNote { + message: message.into(), + span, + }); + } + + /// Adds a "help" to the [`Diagnostic`], which can have zero or many. A + /// "help" helps the user understand how they might fix the error or + /// warning. It can refer to an arbitrary location in the SQL query, or to + /// no location. + pub fn add_help(&mut self, message: impl Into, span: Option) { + self.helps.push(DiagnosticHelp { + message: message.into(), + span, + }); + } + + /// Like [`Diagnostic::add_note`], but returns `self` to allow chaining. + pub fn with_note(mut self, message: impl Into, span: Option) -> Self { + self.add_note(message.into(), span); + self + } + + /// Like [`Diagnostic::add_help`], but returns `self` to allow chaining. + pub fn with_help(mut self, message: impl Into, span: Option) -> Self { + self.add_help(message.into(), span); + self + } +} diff --git a/datafusion/common/src/error.rs b/datafusion/common/src/error.rs index f7c247aaf288..0b4aab1dc7e1 100644 --- a/datafusion/common/src/error.rs +++ b/datafusion/common/src/error.rs @@ -27,7 +27,7 @@ use std::result; use std::sync::Arc; use crate::utils::quote_identifier; -use crate::{Column, DFSchema, TableReference}; +use crate::{Column, DFSchema, Diagnostic, TableReference}; #[cfg(feature = "avro")] use apache_avro::Error as AvroError; use arrow::error::ArrowError; @@ -131,6 +131,11 @@ pub enum DataFusionError { /// Errors from either mapping LogicalPlans to/from Substrait plans /// or serializing/deserializing protobytes to Substrait plans Substrait(String), + /// Error wrapped together with additional contextual information intended + /// for end users, to help them understand what went wrong by providing + /// human-readable messages, and locations in the source query that relate + /// to the error in some way. 
+ Diagnostic(Box, Box), } #[macro_export] @@ -328,6 +333,7 @@ impl Error for DataFusionError { DataFusionError::External(e) => Some(e.as_ref()), DataFusionError::Context(_, e) => Some(e.as_ref()), DataFusionError::Substrait(_) => None, + DataFusionError::Diagnostic(_, e) => Some(e.as_ref()), } } } @@ -441,6 +447,7 @@ impl DataFusionError { DataFusionError::External(_) => "External error: ", DataFusionError::Context(_, _) => "", DataFusionError::Substrait(_) => "Substrait error: ", + DataFusionError::Diagnostic(_, _) => "", } } @@ -481,8 +488,58 @@ impl DataFusionError { Cow::Owned(format!("{desc}\ncaused by\n{}", *err)) } DataFusionError::Substrait(ref desc) => Cow::Owned(desc.to_string()), + DataFusionError::Diagnostic(_, ref err) => Cow::Owned(err.to_string()), } } + + /// Wraps the error with contextual information intended for end users + pub fn with_diagnostic(self, diagnostic: Diagnostic) -> Self { + Self::Diagnostic(Box::new(diagnostic), Box::new(self)) + } + + /// Wraps the error with contextual information intended for end users. + /// Takes a function that inspects the error and returns the diagnostic to + /// wrap it with. + pub fn with_diagnostic_fn Diagnostic>( + self, + f: F, + ) -> Self { + let diagnostic = f(&self); + self.with_diagnostic(diagnostic) + } + + /// Gets the [`Diagnostic`] associated with the error, if any. If there is + /// more than one, only the outermost [`Diagnostic`] is returned. 
+ pub fn diagnostic(&self) -> Option<&Diagnostic> { + struct DiagnosticsIterator<'a> { + head: &'a DataFusionError, + } + + impl<'a> Iterator for DiagnosticsIterator<'a> { + type Item = &'a Diagnostic; + + fn next(&mut self) -> Option { + loop { + if let DataFusionError::Diagnostic(diagnostics, source) = self.head { + self.head = source.as_ref(); + return Some(diagnostics); + } + + if let Some(source) = self + .head + .source() + .and_then(|source| source.downcast_ref::()) + { + self.head = source; + } else { + return None; + } + } + } + } + + DiagnosticsIterator { head: self }.next() + } } /// Unwrap an `Option` if possible. Otherwise return an `DataFusionError::Internal`. diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs index 77e8cd60ede2..1ad2a5f0cec3 100644 --- a/datafusion/common/src/lib.rs +++ b/datafusion/common/src/lib.rs @@ -33,6 +33,7 @@ pub mod alias; pub mod cast; pub mod config; pub mod cse; +pub mod diagnostic; pub mod display; pub mod error; pub mod file_options; @@ -42,6 +43,7 @@ pub mod instant; pub mod parsers; pub mod rounding; pub mod scalar; +pub mod spans; pub mod stats; pub mod test_util; pub mod tree_node; @@ -54,6 +56,7 @@ pub use column::Column; pub use dfschema::{ qualified_name, DFSchema, DFSchemaRef, ExprSchema, SchemaExt, ToDFSchema, }; +pub use diagnostic::Diagnostic; pub use error::{ field_not_found, unqualified_field_not_found, DataFusionError, Result, SchemaError, SharedResult, @@ -72,6 +75,7 @@ pub use join_type::{JoinConstraint, JoinSide, JoinType}; pub use param_value::ParamValues; pub use scalar::{ScalarType, ScalarValue}; pub use schema_reference::SchemaReference; +pub use spans::{Location, Span, Spans}; pub use stats::{ColumnStatistics, Statistics}; pub use table_reference::{ResolvedTableReference, TableReference}; pub use unnest::{RecursionUnnestOption, UnnestOptions}; diff --git a/datafusion/common/src/spans.rs b/datafusion/common/src/spans.rs new file mode 100644 index 000000000000..40ebdeffb601 --- 
/dev/null +++ b/datafusion/common/src/spans.rs @@ -0,0 +1,218 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::cmp::{self, Ordering}; +use std::fmt; +use std::hash::{Hash, Hasher}; + +/// Represents a location, determined by a line and a column number, in the +/// original SQL query. +#[derive(Eq, PartialEq, Hash, Clone, Copy, Ord, PartialOrd)] +pub struct Location { + /// Line number, starting from 1. + /// + /// Note: Line 0 is used for empty spans + pub line: u64, + /// Line column, starting from 1. + /// + /// Note: Column 0 is used for empty spans + pub column: u64, +} + +impl fmt::Debug for Location { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Location({},{})", self.line, self.column) + } +} + +impl From for Location { + fn from(value: sqlparser::tokenizer::Location) -> Self { + Self { + line: value.line, + column: value.column, + } + } +} + +/// Represents an interval of characters in the original SQL query. 
+#[derive(Eq, PartialEq, Hash, Clone, PartialOrd, Ord, Copy)] +pub struct Span { + pub start: Location, + pub end: Location, +} + +impl fmt::Debug for Span { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!(f, "Span({:?}..{:?})", self.start, self.end) + } +} + +impl Span { + /// Creates a new [`Span`] from a start and an end [`Location`]. + pub fn new(start: Location, end: Location) -> Self { + Self { start, end } + } + + /// Convert a [`Span`](sqlparser::tokenizer::Span) from the parser, into a + /// DataFusion [`Span`]. If the input span is empty (line 0 column 0, to + /// line 0 column 0), then [`None`] is returned. + pub fn try_from_sqlparser_span(span: sqlparser::tokenizer::Span) -> Option { + if span == sqlparser::tokenizer::Span::empty() { + None + } else { + Some(Span { + start: span.start.into(), + end: span.end.into(), + }) + } + } + + /// Returns the smallest Span that contains both `self` and `other` + /// + /// # Examples + /// ``` + /// # use sqlparser::tokenizer::{Span, Location}; + /// // line 1, column1 -> line 2, column 5 + /// let span1 = Span::new(Location::new(1, 1), Location::new(2, 5)); + /// // line 2, column 3 -> line 3, column 7 + /// let span2 = Span::new(Location::new(2, 3), Location::new(3, 7)); + /// // Union of the two is the min/max of the two spans + /// // line 1, column 1 -> line 3, column 7 + /// let union = span1.union(&span2); + /// assert_eq!(union, Span::new(Location::new(1, 1), Location::new(3, 7))); + /// ``` + pub fn union(&self, other: &Span) -> Span { + Span { + start: cmp::min(self.start, other.start), + end: cmp::max(self.end, other.end), + } + } + + /// Same as [Span::union] for `Option`. + /// + /// If `other` is `None`, `self` is returned. + pub fn union_opt(&self, other: &Option) -> Span { + match other { + Some(other) => self.union(other), + None => *self, + } + } + + /// Return the [Span::union] of all spans in the iterator. + /// + /// If the iterator is empty, [`None`] is returned. 
+ /// + /// # Example + /// ``` + /// # use sqlparser::tokenizer::{Span, Location}; + /// let spans = vec![ + /// Span::new(Location::new(1, 1), Location::new(2, 5)), + /// Span::new(Location::new(2, 3), Location::new(3, 7)), + /// Span::new(Location::new(3, 1), Location::new(4, 2)), + /// ]; + /// // line 1, column 1 -> line 4, column 2 + /// assert_eq!( + /// Span::union_iter(spans), + /// Span::new(Location::new(1, 1), Location::new(4, 2)) + /// ); + pub fn union_iter>(iter: I) -> Option { + iter.into_iter().reduce(|acc, item| acc.union(&item)) + } +} + +/// A collection of [`Span`], meant to be used as a field of entities whose +/// location in the original SQL query is desired to be tracked. Sometimes an +/// entity can have multiple spans. e.g. if you want to track the position of +/// the column a that comes from SELECT 1 AS a UNION ALL SELECT 2 AS a you'll +/// need two spans. +#[derive(Debug, Clone)] +// Store the first [`Span`] on the stack because that is by far the most common +// case. More will spill onto the heap. +pub struct Spans(pub Vec); + +impl Spans { + /// Creates a new empty [`Spans`] with no [`Span`]. + pub fn new() -> Self { + Spans(Vec::new()) + } + + /// Returns the first [`Span`], if any. This is useful when you know that + /// there will be at most one [`Span`]. + pub fn first(&self) -> Option { + self.0.first().copied() + } + + /// Returns a slice of the [`Span`]s. + pub fn get_spans(&self) -> &[Span] { + &self.0 + } + + /// Adds a [`Span`] to the collection. + pub fn add_span(&mut self, span: Span) { + self.0.push(span); + } + + /// Iterates over the [`Span`]s. + pub fn iter(&self) -> impl Iterator { + self.0.iter() + } +} + +impl Default for Spans { + fn default() -> Self { + Self::new() + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. 
+impl PartialEq for Spans { + fn eq(&self, _other: &Self) -> bool { + true + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl Eq for Spans {} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl PartialOrd for Spans { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. +impl Ord for Spans { + fn cmp(&self, _other: &Self) -> Ordering { + Ordering::Equal + } +} + +// Since [`Spans`] will be used as a field in other structs, we don't want it to +// interfere with the equality and ordering of the entities themselves, since +// this is just diagnostics information for the end user. 
+impl Hash for Spans { + fn hash(&self, _state: &mut H) {} +} diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index a0aa6447871c..61ddba10b09c 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -1819,6 +1819,7 @@ mod tests { use crate::test_util::{plan_and_collect, populate_csv_partitions}; use arrow_schema::{DataType, TimeUnit}; use std::env; + use std::error::Error; use std::path::PathBuf; use datafusion_common_runtime::SpawnedTask; @@ -2043,10 +2044,16 @@ mod tests { Err(DataFusionError::Plan(_)) )); - assert!(matches!( - ctx.sql("select * from datafusion.public.test").await, - Err(DataFusionError::Plan(_)) - )); + let err = ctx + .sql("select * from datafusion.public.test") + .await + .unwrap_err(); + let err = err + .source() + .and_then(|err| err.downcast_ref::()) + .unwrap(); + + assert!(matches!(err, &DataFusionError::Plan(_))); Ok(()) } diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index f83b8528d1e8..4f752dc69be5 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -582,6 +582,7 @@ impl SessionState { enable_options_value_normalization: sql_parser_options .enable_options_value_normalization, support_varchar_with_length: sql_parser_options.support_varchar_with_length, + collect_spans: sql_parser_options.collect_spans, } } diff --git a/datafusion/core/tests/execution/logical_plan.rs b/datafusion/core/tests/execution/logical_plan.rs index 168bf484e541..9f093ccf24b6 100644 --- a/datafusion/core/tests/execution/logical_plan.rs +++ b/datafusion/core/tests/execution/logical_plan.rs @@ -18,7 +18,7 @@ use arrow_array::Int64Array; use arrow_schema::{DataType, Field}; use datafusion::execution::session_state::SessionStateBuilder; -use datafusion_common::{Column, DFSchema, Result, ScalarValue}; +use datafusion_common::{Column, 
DFSchema, Result, ScalarValue, Spans}; use datafusion_execution::TaskContext; use datafusion_expr::expr::AggregateFunction; use datafusion_expr::logical_plan::{LogicalPlan, Values}; @@ -51,6 +51,7 @@ async fn count_only_nulls() -> Result<()> { let input_col_ref = Expr::Column(Column { relation: None, name: "col".to_string(), + spans: Spans::new(), }); // Aggregation: count(col) AS count diff --git a/datafusion/expr-common/src/interval_arithmetic.rs b/datafusion/expr-common/src/interval_arithmetic.rs index 993051eaeee1..549461e0be88 100644 --- a/datafusion/expr-common/src/interval_arithmetic.rs +++ b/datafusion/expr-common/src/interval_arithmetic.rs @@ -18,7 +18,7 @@ //! Interval arithmetic library use crate::operator::Operator; -use crate::type_coercion::binary::get_result_type; +use crate::type_coercion::binary::BinaryTypeCoercer; use std::borrow::Borrow; use std::fmt::{self, Display, Formatter}; use std::ops::{AddAssign, SubAssign}; @@ -1246,7 +1246,10 @@ impl Interval { /// to an error. pub fn equal>(&self, other: T) -> Result { let rhs = other.borrow(); - if get_result_type(&self.data_type(), &Operator::Eq, &rhs.data_type()).is_err() { + if BinaryTypeCoercer::new(&self.data_type(), &Operator::Eq, &rhs.data_type()) + .get_result_type() + .is_err() + { internal_err!( "Interval data types must be compatible for equality checks, lhs:{}, rhs:{}", self.data_type(), @@ -1417,7 +1420,9 @@ impl Interval { /// choose single values arbitrarily from each of the operands. pub fn add>(&self, other: T) -> Result { let rhs = other.borrow(); - let dt = get_result_type(&self.data_type(), &Operator::Plus, &rhs.data_type())?; + let dt = + BinaryTypeCoercer::new(&self.data_type(), &Operator::Plus, &rhs.data_type()) + .get_result_type()?; Ok(Self::new( add_bounds::(&dt, &self.lower, &rhs.lower), @@ -1432,7 +1437,9 @@ impl Interval { /// each of the operands. 
pub fn sub>(&self, other: T) -> Result { let rhs = other.borrow(); - let dt = get_result_type(&self.data_type(), &Operator::Minus, &rhs.data_type())?; + let dt = + BinaryTypeCoercer::new(&self.data_type(), &Operator::Minus, &rhs.data_type()) + .get_result_type()?; Ok(Self::new( sub_bounds::(&dt, &self.lower, &rhs.upper), diff --git a/datafusion/expr-common/src/type_coercion/binary.rs b/datafusion/expr-common/src/type_coercion/binary.rs index 571c17119427..3195218ea28e 100644 --- a/datafusion/expr-common/src/type_coercion/binary.rs +++ b/datafusion/expr-common/src/type_coercion/binary.rs @@ -30,7 +30,8 @@ use arrow::datatypes::{ }; use datafusion_common::types::NativeType; use datafusion_common::{ - exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, Result, + exec_datafusion_err, exec_err, internal_err, plan_datafusion_err, plan_err, + Diagnostic, Result, Span, Spans, }; use itertools::Itertools; @@ -68,11 +69,64 @@ impl Signature { } } -/// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` -fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result { - use arrow::datatypes::DataType::*; - use Operator::*; - match op { +/// Provides type information about a binary expression, coercing different +/// input types into a sensible output type. +pub struct BinaryTypeCoercer<'a> { + lhs: &'a DataType, + op: &'a Operator, + rhs: &'a DataType, + + lhs_spans: Spans, + op_spans: Spans, + rhs_spans: Spans, +} + +impl<'a> BinaryTypeCoercer<'a> { + /// Creates a new [`BinaryTypeCoercer`], for reasoning about the input + /// and output types of a binary expression. + pub fn new(lhs: &'a DataType, op: &'a Operator, rhs: &'a DataType) -> Self { + Self { + lhs, + op, + rhs, + lhs_spans: Spans::new(), + op_spans: Spans::new(), + rhs_spans: Spans::new(), + } + } + + /// Sets the spans information for the left side of the binary expression, + /// so better diagnostics can be provided in case of errors. 
+ pub fn set_lhs_spans(&mut self, spans: Spans) { + self.lhs_spans = spans; + } + + /// Sets the spans information for the operator of the binary expression, so + /// better diagnostics can be provided in case of errors. + pub fn set_op_spans(&mut self, spans: Spans) { + self.op_spans = spans; + } + + /// Sets the spans information for the right side of the binary expression, + /// so better diagnostics can be provided in case of errors. + pub fn set_rhs_spans(&mut self, spans: Spans) { + self.rhs_spans = spans; + } + + fn span(&self) -> Option { + Span::union_iter( + [self.lhs_spans.first(), self.rhs_spans.first()] + .iter() + .copied() + .flatten(), + ) + } + + /// Returns a [`Signature`] for applying `op` to arguments of type `lhs` and `rhs` + fn signature(&'a self) -> Result { + use arrow::datatypes::DataType::*; + use Operator::*; + let result = match self.op { Eq | NotEq | Lt | @@ -81,46 +135,49 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result GtEq | IsDistinctFrom | IsNotDistinctFrom => { - comparison_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + comparison_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for comparison operation {lhs} {op} {rhs}" + "Cannot infer common argument type for comparison operation {} {} {}", + self.lhs, + self.op, + self.rhs ) }) } - And | Or => if matches!((lhs, rhs), (Boolean | Null, Boolean | Null)) { + And | Or => if matches!((self.lhs, self.rhs), (Boolean | Null, Boolean | Null)) { // Logical binary boolean operators can only be evaluated for // boolean or null arguments. 
Ok(Signature::uniform(Boolean)) } else { plan_err!( - "Cannot infer common argument type for logical boolean operation {lhs} {op} {rhs}" + "Cannot infer common argument type for logical boolean operation {} {} {}", self.lhs, self.op, self.rhs ) } RegexMatch | RegexIMatch | RegexNotMatch | RegexNotIMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs ) }) } LikeMatch | ILikeMatch | NotLikeMatch | NotILikeMatch => { - regex_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + regex_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common argument type for regex operation {lhs} {op} {rhs}" + "Cannot infer common argument type for regex operation {} {} {}", self.lhs, self.op, self.rhs ) }) } BitwiseAnd | BitwiseOr | BitwiseXor | BitwiseShiftRight | BitwiseShiftLeft => { - bitwise_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + bitwise_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common type for bitwise operation {lhs} {op} {rhs}" + "Cannot infer common type for bitwise operation {} {} {}", self.lhs, self.op, self.rhs ) }) } StringConcat => { - string_concat_coercion(lhs, rhs).map(Signature::uniform).ok_or_else(|| { + string_concat_coercion(self.lhs, self.rhs).map(Signature::uniform).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common string type for string concat operation {lhs} {op} {rhs}" + "Cannot infer common string type for string concat operation {} {} {}", self.lhs, self.op, self.rhs ) }) } @@ -128,9 +185,9 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result // ArrowAt and AtArrow check for 
whether one array is contained in another. // The result type is boolean. Signature::comparison defines this signature. // Operation has nothing to do with comparison - array_coercion(lhs, rhs).map(Signature::comparison).ok_or_else(|| { + array_coercion(self.lhs, self.rhs).map(Signature::comparison).ok_or_else(|| { plan_datafusion_err!( - "Cannot infer common array type for arrow operation {lhs} {op} {rhs}" + "Cannot infer common array type for arrow operation {} {} {}", self.lhs, self.op, self.rhs ) }) } @@ -140,7 +197,7 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result let l = new_empty_array(lhs); let r = new_empty_array(rhs); - let result = match op { + let result = match self.op { Plus => add_wrapping(&l, &r), Minus => sub_wrapping(&l, &r), Multiply => mul_wrapping(&l, &r), @@ -151,19 +208,19 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result result.map(|x| x.data_type().clone()) }; - if let Ok(ret) = get_result(lhs, rhs) { + if let Ok(ret) = get_result(self.lhs, self.rhs) { // Temporal arithmetic, e.g. Date32 + Interval Ok(Signature{ - lhs: lhs.clone(), - rhs: rhs.clone(), + lhs: self.lhs.clone(), + rhs: self.rhs.clone(), ret, }) - } else if let Some(coerced) = temporal_coercion_strict_timezone(lhs, rhs) { + } else if let Some(coerced) = temporal_coercion_strict_timezone(self.lhs, self.rhs) { // Temporal arithmetic by first coercing to a common time representation // e.g. 
Date32 - Timestamp let ret = get_result(&coerced, &coerced).map_err(|e| { plan_datafusion_err!( - "Cannot get result type for temporal operation {coerced} {op} {coerced}: {e}" + "Cannot get result type for temporal operation {coerced} {} {coerced}: {e}", self.op ) })?; Ok(Signature{ @@ -171,11 +228,11 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs: coerced, ret, }) - } else if let Some((lhs, rhs)) = math_decimal_coercion(lhs, rhs) { + } else if let Some((lhs, rhs)) = math_decimal_coercion(self.lhs, self.rhs) { // Decimal arithmetic, e.g. Decimal(10, 2) + Decimal(10, 0) let ret = get_result(&lhs, &rhs).map_err(|e| { plan_datafusion_err!( - "Cannot get result type for decimal operation {lhs} {op} {rhs}: {e}" + "Cannot get result type for decimal operation {} {} {}: {e}", self.lhs, self.op, self.rhs ) })?; Ok(Signature{ @@ -183,36 +240,38 @@ fn signature(lhs: &DataType, op: &Operator, rhs: &DataType) -> Result rhs, ret, }) - } else if let Some(numeric) = mathematics_numerical_coercion(lhs, rhs) { + } else if let Some(numeric) = mathematics_numerical_coercion(self.lhs, self.rhs) { // Numeric arithmetic, e.g. 
Int32 + Int32 Ok(Signature::uniform(numeric)) } else { plan_err!( - "Cannot coerce arithmetic expression {lhs} {op} {rhs} to valid types" + "Cannot coerce arithmetic expression {} {} {} to valid types", self.lhs, self.op, self.rhs ) } } + }; + result.map_err(|err| { + let diagnostic = + Diagnostic::new_error("expressions have incompatible types", self.span()) + .with_note(format!("has type {}", self.lhs), self.lhs_spans.first()) + .with_note(format!("has type {}", self.rhs), self.rhs_spans.first()); + err.with_diagnostic(diagnostic) + }) } -} -/// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types -pub fn get_result_type( - lhs: &DataType, - op: &Operator, - rhs: &DataType, -) -> Result { - signature(lhs, op, rhs).map(|sig| sig.ret) -} + /// Returns the resulting type of a binary expression evaluating the `op` with the left and right hand types + pub fn get_result_type(&'a self) -> Result { + self.signature().map(|sig| sig.ret) + } -/// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types -pub fn get_input_types( - lhs: &DataType, - op: &Operator, - rhs: &DataType, -) -> Result<(DataType, DataType)> { - signature(lhs, op, rhs).map(|sig| (sig.lhs, sig.rhs)) + /// Returns the coerced input types for a binary expression evaluating the `op` with the left and right hand types + pub fn get_input_types(&'a self) -> Result<(DataType, DataType)> { + self.signature().map(|sig| (sig.lhs, sig.rhs)) + } } +// TODO Move the rest inside of BinaryTypeCoercer + /// Coercion rules for mathematics operators between decimal and non-decimal types. 
fn math_decimal_coercion( lhs_type: &DataType, @@ -1475,8 +1534,9 @@ mod tests { #[test] fn test_coercion_error() -> Result<()> { - let result_type = - get_input_types(&DataType::Float32, &Operator::Plus, &DataType::Utf8); + let coercer = + BinaryTypeCoercer::new(&DataType::Float32, &Operator::Plus, &DataType::Utf8); + let result_type = coercer.get_input_types(); let e = result_type.unwrap_err(); assert_eq!(e.strip_backtrace(), "Error during planning: Cannot coerce arithmetic expression Float32 + Utf8 to valid types"); @@ -1519,14 +1579,16 @@ mod tests { for (i, input_type) in input_types.iter().enumerate() { let expect_type = &result_types[i]; for op in comparison_op_types { - let (lhs, rhs) = get_input_types(&input_decimal, &op, input_type)?; + let (lhs, rhs) = BinaryTypeCoercer::new(&input_decimal, &op, input_type) + .get_input_types()?; assert_eq!(expect_type, &lhs); assert_eq!(expect_type, &rhs); } } // negative test let result_type = - get_input_types(&input_decimal, &Operator::Eq, &DataType::Boolean); + BinaryTypeCoercer::new(&input_decimal, &Operator::Eq, &DataType::Boolean) + .get_input_types(); assert!(result_type.is_err()); Ok(()) } @@ -1619,7 +1681,8 @@ mod tests { /// the result type is `$RESULT_TYPE` macro_rules! 
test_coercion_binary_rule { ($LHS_TYPE:expr, $RHS_TYPE:expr, $OP:expr, $RESULT_TYPE:expr) => {{ - let (lhs, rhs) = get_input_types(&$LHS_TYPE, &$OP, &$RHS_TYPE)?; + let (lhs, rhs) = + BinaryTypeCoercer::new(&$LHS_TYPE, &$OP, &$RHS_TYPE).get_input_types()?; assert_eq!(lhs, $RESULT_TYPE); assert_eq!(rhs, $RESULT_TYPE); }}; @@ -1645,17 +1708,20 @@ mod tests { #[test] fn test_date_timestamp_arithmetic_error() -> Result<()> { - let (lhs, rhs) = get_input_types( + let (lhs, rhs) = BinaryTypeCoercer::new( &DataType::Timestamp(TimeUnit::Nanosecond, None), &Operator::Minus, &DataType::Timestamp(TimeUnit::Millisecond, None), - )?; + ) + .get_input_types()?; assert_eq!(lhs.to_string(), "Timestamp(Millisecond, None)"); assert_eq!(rhs.to_string(), "Timestamp(Millisecond, None)"); - let err = get_input_types(&DataType::Date32, &Operator::Plus, &DataType::Date64) - .unwrap_err() - .to_string(); + let err = + BinaryTypeCoercer::new(&DataType::Date32, &Operator::Plus, &DataType::Date64) + .get_input_types() + .unwrap_err() + .to_string(); assert_contains!( &err, @@ -2195,11 +2261,12 @@ mod tests { DataType::Timestamp(TimeUnit::Microsecond, None), true, )); - let result_type = get_input_types( + let result_type = BinaryTypeCoercer::new( &DataType::List(Arc::clone(&inner_field)), &Operator::Eq, &DataType::List(Arc::clone(&inner_timestamp_field)), - ); + ) + .get_input_types(); assert!(result_type.is_err()); // TODO add other data type diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index 7070761f6383..305519a1f4b4 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -35,7 +35,7 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeContainer, TreeNodeRecursion, }; use datafusion_common::{ - plan_err, Column, DFSchema, HashMap, Result, ScalarValue, TableReference, + plan_err, Column, DFSchema, HashMap, Result, ScalarValue, Spans, TableReference, }; use 
datafusion_functions_window_common::field::WindowUDFFieldArgs; use sqlparser::ast::{ @@ -1111,7 +1111,11 @@ impl Expr { /// output schema. We can use this qualified name to reference the field. pub fn qualified_name(&self) -> (Option, String) { match self { - Expr::Column(Column { relation, name }) => (relation.clone(), name.clone()), + Expr::Column(Column { + relation, + name, + spans: _, + }) => (relation.clone(), name.clone()), Expr::Alias(Alias { relation, name, .. }) => (relation.clone(), name.clone()), _ => (None, self.schema_name().to_string()), } @@ -1663,6 +1667,16 @@ impl Expr { | Expr::Placeholder(..) => false, } } + + /// Returns a reference to the set of locations in the SQL query where this + /// expression appears, if known. [`None`] is returned if the expression + /// type doesn't support tracking locations yet. + pub fn spans(&self) -> Option<&Spans> { + match self { + Expr::Column(col) => Some(&col.spans), + _ => None, + } + } } impl Normalizeable for Expr { diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 335d5587bee7..0b1b10b4a43e 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -157,10 +157,7 @@ pub fn unnormalize_col(expr: Expr) -> Expr { expr.transform(|expr| { Ok({ if let Expr::Column(c) = expr { - let col = Column { - relation: None, - name: c.name, - }; + let col = Column::new_unqualified(c.name); Transformed::yes(Expr::Column(col)) } else { Transformed::no(expr) @@ -181,10 +178,7 @@ pub fn create_col_from_scalar_expr( Some::(subqry_alias.into()), name, )), - Expr::Column(Column { relation: _, name }) => Ok(Column::new( - Some::(subqry_alias.into()), - name, - )), + Expr::Column(col) => Ok(col.with_relation(subqry_alias.into())), _ => { let scalar_column = scalar_expr.schema_name().to_string(); Ok(Column::new( diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 
f0d3d8fcd0c1..0044b6cf6f37 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -98,10 +98,7 @@ fn rewrite_in_terms_of_projection( // for a column with the same "MIN(C2)", so translate there let name = normalized_expr.schema_name().to_string(); - let search_col = Expr::Column(Column { - relation: None, - name, - }); + let search_col = Expr::Column(Column::new_unqualified(name)); // look for the column named the same as this expr if let Some(found) = proj_exprs.iter().find(|a| expr_match(&search_col, a)) { diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 08eb06160c09..49791427131f 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -20,7 +20,6 @@ use crate::expr::{ AggregateFunction, Alias, BinaryExpr, Cast, InList, InSubquery, Placeholder, ScalarFunction, TryCast, Unnest, WindowFunction, }; -use crate::type_coercion::binary::get_result_type; use crate::type_coercion::functions::{ data_types_with_aggregate_udf, data_types_with_scalar_udf, data_types_with_window_udf, }; @@ -32,6 +31,7 @@ use datafusion_common::{ not_impl_err, plan_datafusion_err, plan_err, Column, DataFusionError, ExprSchema, Result, TableReference, }; +use datafusion_expr_common::type_coercion::binary::BinaryTypeCoercer; use datafusion_functions_window_common::field::WindowUDFFieldArgs; use std::collections::HashMap; use std::sync::Arc; @@ -195,7 +195,12 @@ impl ExprSchemable for Expr { ref left, ref right, ref op, - }) => get_result_type(&left.get_type(schema)?, op, &right.get_type(schema)?), + }) => BinaryTypeCoercer::new( + &left.get_type(schema)?, + op, + &right.get_type(schema)?, + ) + .get_result_type(), Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean), Expr::Placeholder(Placeholder { data_type, .. 
}) => { if let Some(dtype) = data_type { @@ -387,9 +392,12 @@ impl ExprSchemable for Expr { ref right, ref op, }) => { - let left = left.data_type_and_nullable(schema)?; - let right = right.data_type_and_nullable(schema)?; - Ok((get_result_type(&left.0, op, &right.0)?, left.1 || right.1)) + let (lhs_type, lhs_nullable) = left.data_type_and_nullable(schema)?; + let (rhs_type, rhs_nullable) = right.data_type_and_nullable(schema)?; + let mut coercer = BinaryTypeCoercer::new(&lhs_type, op, &rhs_type); + coercer.set_lhs_spans(left.spans().cloned().unwrap_or_default()); + coercer.set_rhs_spans(right.spans().cloned().unwrap_or_default()); + Ok((coercer.get_result_type()?, lhs_nullable || rhs_nullable)) } Expr::WindowFunction(window_function) => { self.data_type_and_nullable_with_window_function(schema, window_function) diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index b0c28e145525..6203793ec698 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -2194,6 +2194,7 @@ mod tests { Column { relation: Some(TableReference::Bare { table }), name, + spans: _, }, }, _, diff --git a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs index 9fbe54e1ccb9..7df4e970ada2 100644 --- a/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/expand_wildcard_rule.rs @@ -123,6 +123,8 @@ fn expand_exprlist(input: &LogicalPlan, expr: Vec) -> Result> { Expr::Column(Column { ref relation, ref name, + // TODO Should we use these spans? 
+ spans: _, }) => { if name.eq("*") { if let Some(qualifier) = relation { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 7a41f54c56e1..5097ff37643d 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -19,6 +19,7 @@ use std::sync::Arc; +use datafusion_expr::binary::BinaryTypeCoercer; use itertools::izip; use arrow::datatypes::{DataType, Field, IntervalUnit, Schema}; @@ -38,9 +39,7 @@ use datafusion_expr::expr::{ use datafusion_expr::expr_rewriter::coerce_plan_expr_for_schema; use datafusion_expr::expr_schema::cast_subquery; use datafusion_expr::logical_plan::Subquery; -use datafusion_expr::type_coercion::binary::{ - comparison_coercion, get_input_types, like_coercion, -}; +use datafusion_expr::type_coercion::binary::{comparison_coercion, like_coercion}; use datafusion_expr::type_coercion::functions::{ data_types_with_aggregate_udf, data_types_with_scalar_udf, }; @@ -288,11 +287,12 @@ impl<'a> TypeCoercionRewriter<'a> { right: Expr, right_schema: &DFSchema, ) -> Result<(Expr, Expr)> { - let (left_type, right_type) = get_input_types( + let (left_type, right_type) = BinaryTypeCoercer::new( &left.get_type(left_schema)?, &op, &right.get_type(right_schema)?, - )?; + ) + .get_input_types()?; Ok(( left.cast_to(&left_type, left_schema)?, right.cast_to(&right_type, right_schema)?, @@ -747,7 +747,8 @@ fn coerce_window_frame( // The above op will be rewrite to the binary op when creating the physical op. 
fn get_casted_expr_for_bool_op(expr: Expr, schema: &DFSchema) -> Result { let left_type = expr.get_type(schema)?; - get_input_types(&left_type, &Operator::IsDistinctFrom, &DataType::Boolean)?; + BinaryTypeCoercer::new(&left_type, &Operator::IsDistinctFrom, &DataType::Boolean) + .get_input_types()?; expr.cast_to(&DataType::Boolean, schema) } @@ -1011,6 +1012,7 @@ fn project_with_column_index( Expr::Column(Column { relation: _, ref name, + spans: _, }) if name != schema.field(i).name() => Ok(e.alias(schema.field(i).name())), Expr::Alias { .. } | Expr::Column { .. } => Ok(e), Expr::Wildcard { .. } => { diff --git a/datafusion/optimizer/src/decorrelate.rs b/datafusion/optimizer/src/decorrelate.rs index ee6ea08b43bf..36da61f4e8fe 100644 --- a/datafusion/optimizer/src/decorrelate.rs +++ b/datafusion/optimizer/src/decorrelate.rs @@ -533,7 +533,11 @@ fn proj_exprs_evaluation_result_on_empty_batch( let result_expr = simplifier.simplify(result_expr)?; let expr_name = match expr { Expr::Alias(Alias { name, .. 
}) => name.to_string(), - Expr::Column(Column { relation: _, name }) => name.to_string(), + Expr::Column(Column { + relation: _, + name, + spans: _, + }) => name.to_string(), _ => expr.schema_name().to_string(), }; expr_result_map_for_count_bug.insert(expr_name, result_expr); diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index 8aa45063c84f..1713842f410e 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -33,9 +33,9 @@ use arrow::datatypes::*; use arrow_schema::ArrowError; use datafusion_common::cast::as_boolean_array; use datafusion_common::{internal_err, Result, ScalarValue}; +use datafusion_expr::binary::BinaryTypeCoercer; use datafusion_expr::interval_arithmetic::{apply_operator, Interval}; use datafusion_expr::sort_properties::ExprProperties; -use datafusion_expr::type_coercion::binary::get_result_type; use datafusion_expr::{ColumnarValue, Operator}; use datafusion_physical_expr_common::datum::{apply, apply_cmp, apply_cmp_for_nested}; @@ -278,11 +278,12 @@ impl PhysicalExpr for BinaryExpr { } fn data_type(&self, input_schema: &Schema) -> Result { - get_result_type( + BinaryTypeCoercer::new( &self.left.data_type(input_schema)?, &self.op, &self.right.data_type(input_schema)?, ) + .get_result_type() } fn nullable(&self, input_schema: &Schema) -> Result { @@ -732,7 +733,6 @@ mod tests { use super::*; use crate::expressions::{col, lit, try_cast, Column, Literal}; use datafusion_common::plan_datafusion_err; - use datafusion_expr::type_coercion::binary::get_input_types; /// Performs a binary operation, applying any type coercion necessary fn binary_op( @@ -743,7 +743,8 @@ mod tests { ) -> Result> { let left_type = left.data_type(schema)?; let right_type = right.data_type(schema)?; - let (lhs, rhs) = get_input_types(&left_type, &op, &right_type)?; + let (lhs, rhs) = + BinaryTypeCoercer::new(&left_type, &op, 
&right_type).get_input_types()?; let left_expr = try_cast(left, schema, lhs)?; let right_expr = try_cast(right, schema, rhs)?; @@ -847,7 +848,7 @@ mod tests { ]); let a = $A_ARRAY::from($A_VEC); let b = $B_ARRAY::from($B_VEC); - let (lhs, rhs) = get_input_types(&$A_TYPE, &$OP, &$B_TYPE)?; + let (lhs, rhs) = BinaryTypeCoercer::new(&$A_TYPE, &$OP, &$B_TYPE).get_input_types()?; let left = try_cast(col("a", &schema)?, &schema, lhs)?; let right = try_cast(col("b", &schema)?, &schema, rhs)?; diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index 9adf14459081..130cdf083da3 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -16,14 +16,13 @@ // under the License. use arrow_schema::Field; -use sqlparser::ast::{Expr as SQLExpr, Ident}; - use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, - DataFusionError, Result, TableReference, + DataFusionError, Result, Span, TableReference, }; use datafusion_expr::planner::PlannerResult; use datafusion_expr::{Case, Expr}; +use sqlparser::ast::{Expr as SQLExpr, Ident}; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_expr::UNNAMED_TABLE; @@ -35,6 +34,7 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { + let id_span = id.span; if id.value.starts_with('@') { // TODO: figure out if ScalarVariables should be insensitive. 
let var_names = vec![id.value]; @@ -56,10 +56,16 @@ impl SqlToRel<'_, S> { if let Ok((qualifier, _)) = schema.qualified_field_with_unqualified_name(normalize_ident.as_str()) { - return Ok(Expr::Column(Column { - relation: qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), - name: normalize_ident, - })); + let mut column = Column::new( + qualifier.filter(|q| q.table() != UNNAMED_TABLE).cloned(), + normalize_ident, + ); + if self.options.collect_spans { + if let Some(span) = Span::try_from_sqlparser_span(id_span) { + column.spans_mut().add_span(span); + } + } + return Ok(Expr::Column(column)); } // Check the outer query schema @@ -76,10 +82,13 @@ impl SqlToRel<'_, S> { } // Default case - Ok(Expr::Column(Column { - relation: None, - name: normalize_ident, - })) + let mut column = Column::new_unqualified(normalize_ident); + if self.options.collect_spans { + if let Some(span) = Span::try_from_sqlparser_span(id_span) { + column.spans_mut().add_span(span); + } + } + Ok(Expr::Column(column)) } } @@ -93,6 +102,11 @@ impl SqlToRel<'_, S> { return internal_err!("Not a compound identifier: {ids:?}"); } + let ids_span = Span::union_iter( + ids.iter() + .filter_map(|id| Span::try_from_sqlparser_span(id.span)), + ); + if ids[0].value.starts_with('@') { let var_names: Vec<_> = ids .into_iter() @@ -136,7 +150,13 @@ impl SqlToRel<'_, S> { } // Found matching field with no spare identifier(s) Some((field, qualifier, _nested_names)) => { - Ok(Expr::Column(Column::from((qualifier, field)))) + let mut column = Column::from((qualifier, field)); + if self.options.collect_spans { + if let Some(span) = ids_span { + column.spans_mut().add_span(span); + } + } + Ok(Expr::Column(column)) } None => { // Return default where use all identifiers to not have a nested field @@ -179,7 +199,13 @@ impl SqlToRel<'_, S> { let s = &ids[0..ids.len()]; // Safe unwrap as s can never be empty or exceed the bounds let (relation, column_name) = form_identifier(s).unwrap(); - 
Ok(Expr::Column(Column::new(relation, column_name))) + let mut column = Column::new(relation, column_name); + if self.options.collect_spans { + if let Some(span) = ids_span { + column.spans_mut().add_span(span); + } + } + Ok(Expr::Column(column)) } } } diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 9b40ebdaf6a5..eb62903a0493 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -165,6 +165,7 @@ impl SqlToRel<'_, S> { schema: &DFSchema, planner_context: &mut PlannerContext, ) -> Result { + // The location of the original SQL expression in the source code let mut expr = self.sql_expr_to_logical_expr(sql, schema, planner_context)?; expr = self.rewrite_partial_qualifier(expr, schema); self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr))?; diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index f185d65fa194..e836fa89bf75 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -284,10 +284,10 @@ impl<'a> DFParser<'a> { dialect: &'a dyn Dialect, ) -> Result { let mut tokenizer = Tokenizer::new(dialect, sql); - let tokens = tokenizer.tokenize()?; + let tokens = tokenizer.tokenize_with_location()?; Ok(DFParser { - parser: Parser::new(dialect).with_tokens(tokens), + parser: Parser::new(dialect).with_tokens_with_locations(tokens), }) } diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 628bcb2fbdcd..f22ff6d94fc2 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -22,7 +22,8 @@ use std::vec; use arrow_schema::*; use datafusion_common::{ - field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, SchemaError, + field_not_found, internal_err, plan_datafusion_err, DFSchemaRef, Diagnostic, + SchemaError, }; use sqlparser::ast::TimezoneInfo; use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; @@ -30,10 +31,7 @@ use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use 
sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; use datafusion_common::TableReference; -use datafusion_common::{ - not_impl_err, plan_err, unqualified_field_not_found, DFSchema, DataFusionError, - Result, -}; +use datafusion_common::{not_impl_err, plan_err, DFSchema, DataFusionError, Result}; use datafusion_expr::logical_plan::{LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::utils::find_column_exprs; use datafusion_expr::{col, Expr}; @@ -42,12 +40,13 @@ use crate::utils::make_decimal_type; pub use datafusion_expr::planner::ContextProvider; /// SQL parser options -#[derive(Debug)] +#[derive(Debug, Clone, Copy)] pub struct ParserOptions { pub parse_float_as_decimal: bool, pub enable_ident_normalization: bool, pub support_varchar_with_length: bool, pub enable_options_value_normalization: bool, + pub collect_spans: bool, } impl Default for ParserOptions { @@ -57,6 +56,7 @@ impl Default for ParserOptions { enable_ident_normalization: true, support_varchar_with_length: true, enable_options_value_normalization: false, + collect_spans: false, } } } @@ -353,20 +353,41 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .iter() .try_for_each(|col| match col { Expr::Column(col) => match &col.relation { - Some(r) => { - schema.field_with_qualified_name(r, &col.name)?; - Ok(()) - } + Some(r) => schema.field_with_qualified_name(r, &col.name).map(|_| ()), None => { if !schema.fields_with_unqualified_name(&col.name).is_empty() { Ok(()) } else { - Err(unqualified_field_not_found(col.name.as_str(), schema)) + Err(field_not_found( + col.relation.clone(), + col.name.as_str(), + schema, + )) } } } - .map_err(|_: DataFusionError| { - field_not_found(col.relation.clone(), col.name.as_str(), schema) + .map_err(|err: DataFusionError| match &err { + DataFusionError::SchemaError( + SchemaError::FieldNotFound { .. 
}, + _, + ) => { + let diagnostic = if let Some(relation) = &col.relation { + Diagnostic::new_error( + format!( + "column '{}' not found in '{}'", + &col.name, relation + ), + col.spans().first(), + ) + } else { + Diagnostic::new_error( + format!("column '{}' not found", &col.name), + col.spans().first(), + ) + }; + err.with_diagnostic(diagnostic) + } + _ => err, }), _ => internal_err!("Not a column"), }) diff --git a/datafusion/sql/src/relation/mod.rs b/datafusion/sql/src/relation/mod.rs index 8915b0069269..800dd151a124 100644 --- a/datafusion/sql/src/relation/mod.rs +++ b/datafusion/sql/src/relation/mod.rs @@ -20,11 +20,13 @@ use std::sync::Arc; use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use datafusion_common::tree_node::{Transformed, TreeNode}; -use datafusion_common::{not_impl_err, plan_err, DFSchema, Result, TableReference}; +use datafusion_common::{ + not_impl_err, plan_err, DFSchema, Diagnostic, Result, Span, TableReference, +}; use datafusion_expr::builder::subquery_alias; use datafusion_expr::{expr::Unnest, Expr, LogicalPlan, LogicalPlanBuilder}; use datafusion_expr::{Subquery, SubqueryAlias}; -use sqlparser::ast::{FunctionArg, FunctionArgExpr, TableFactor}; +use sqlparser::ast::{FunctionArg, FunctionArgExpr, Spanned, TableFactor}; mod join; @@ -35,6 +37,7 @@ impl SqlToRel<'_, S> { relation: TableFactor, planner_context: &mut PlannerContext, ) -> Result { + let relation_span = relation.span(); let (plan, alias) = match relation { TableFactor::Table { name, alias, args, .. @@ -80,11 +83,19 @@ impl SqlToRel<'_, S> { self.context_provider.get_table_source(table_ref.clone()), ) { (Some(cte_plan), _) => Ok(cte_plan.clone()), - (_, Ok(provider)) => { - LogicalPlanBuilder::scan(table_ref, provider, None)? - .build() + (_, Ok(provider)) => LogicalPlanBuilder::scan( + table_ref.clone(), + provider, + None, + )? 
+ .build(), + (None, Err(e)) => { + let e = e.with_diagnostic(Diagnostic::new_error( + format!("table '{}' not found", table_ref), + Span::try_from_sqlparser_span(relation_span), + )); + Err(e) } - (None, Err(e)) => Err(e), }?, alias, ) diff --git a/datafusion/sql/src/select.rs b/datafusion/sql/src/select.rs index b160c9d8748d..50d89dea6763 100644 --- a/datafusion/sql/src/select.rs +++ b/datafusion/sql/src/select.rs @@ -22,6 +22,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use crate::utils::{ check_columns_satisfy_exprs, extract_aliases, rebase_expr, resolve_aliases_to_exprs, resolve_columns, resolve_positions_to_exprs, rewrite_recursive_unnests_bottom_up, + CheckColumnsSatisfyExprsPurpose, }; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; @@ -796,7 +797,7 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, &select_exprs_post_aggr, - "Projection references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate, )?; // Rewrite the HAVING expression to use the columns produced by the @@ -808,7 +809,7 @@ impl SqlToRel<'_, S> { check_columns_satisfy_exprs( &column_exprs_post_aggr, std::slice::from_ref(&having_expr_post_aggr), - "HAVING clause references non-aggregate values", + CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate, )?; Some(having_expr_post_aggr) diff --git a/datafusion/sql/src/set_expr.rs b/datafusion/sql/src/set_expr.rs index 75fdbd20e840..290c0174784a 100644 --- a/datafusion/sql/src/set_expr.rs +++ b/datafusion/sql/src/set_expr.rs @@ -16,9 +16,9 @@ // under the License. 
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use datafusion_common::{not_impl_err, Result}; +use datafusion_common::{not_impl_err, plan_err, Diagnostic, Result, Span}; use datafusion_expr::{LogicalPlan, LogicalPlanBuilder}; -use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier}; +use sqlparser::ast::{SetExpr, SetOperator, SetQuantifier, Spanned}; impl SqlToRel<'_, S> { #[cfg_attr(feature = "recursive_protection", recursive::recursive)] @@ -27,6 +27,7 @@ impl SqlToRel<'_, S> { set_expr: SetExpr, planner_context: &mut PlannerContext, ) -> Result { + let set_expr_span = Span::try_from_sqlparser_span(set_expr.span()); match set_expr { SetExpr::Select(s) => self.select_to_plan(*s, vec![], planner_context), SetExpr::Values(v) => self.sql_values_to_plan(v, planner_context), @@ -36,8 +37,18 @@ impl SqlToRel<'_, S> { right, set_quantifier, } => { + let left_span = Span::try_from_sqlparser_span(left.span()); + let right_span = Span::try_from_sqlparser_span(right.span()); let left_plan = self.set_expr_to_plan(*left, planner_context)?; let right_plan = self.set_expr_to_plan(*right, planner_context)?; + self.validate_set_expr_num_of_columns( + op, + left_span, + right_span, + &left_plan, + &right_plan, + set_expr_span, + )?; self.set_operation_to_plan(op, left_plan, right_plan, set_quantifier) } SetExpr::Query(q) => self.query_to_plan(*q, planner_context), @@ -61,6 +72,40 @@ impl SqlToRel<'_, S> { } } + fn validate_set_expr_num_of_columns( + &self, + op: SetOperator, + left_span: Option, + right_span: Option, + left_plan: &LogicalPlan, + right_plan: &LogicalPlan, + set_expr_span: Option, + ) -> Result<()> { + if left_plan.schema().fields().len() == right_plan.schema().fields().len() { + return Ok(()); + } + + plan_err!("{} queries have different number of columns", op).map_err(|err| { + err.with_diagnostic( + Diagnostic::new_error( + format!("{} queries have different number of columns", op), + set_expr_span, + ) + .with_note( + format!("this side has 
{} fields", left_plan.schema().fields().len()), + left_span, + ) + .with_note( + format!( + "this side has {} fields", + right_plan.schema().fields().len() + ), + right_span, + ), + ) + }) + } + pub(super) fn set_operation_to_plan( &self, op: SetOperator, diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index 96e1cec001cb..0aa3d4d3bc92 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -1629,7 +1629,7 @@ mod tests { use arrow::datatypes::{Field, Schema}; use arrow_schema::DataType::Int8; use ast::ObjectName; - use datafusion_common::TableReference; + use datafusion_common::{Spans, TableReference}; use datafusion_expr::expr::WildcardOptions; use datafusion_expr::{ case, cast, col, cube, exists, grouping_set, interval_datetime_lit, @@ -1711,6 +1711,7 @@ mod tests { Expr::Column(Column { relation: Some(TableReference::partial("a", "b")), name: "c".to_string(), + spans: Spans::new(), }) .gt(lit(4)), r#"(b.c > 4)"#, @@ -2042,6 +2043,7 @@ mod tests { expr: Box::new(Expr::Column(Column { relation: Some(TableReference::partial("schema", "table")), name: "array_col".to_string(), + spans: Spans::new(), })), }), r#"UNNEST("table".array_col)"#, diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 2e02baa4f7f5..368131751e91 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -1096,10 +1096,12 @@ impl Unparser<'_> { Expr::Column(Column { relation: _, name: left_name, + spans: _, }), Expr::Column(Column { relation: _, name: right_name, + spans: _, }), ) if left_name == right_name => { idents.push(self.new_ident_quoted_if_needs(left_name.to_string())); diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 74ead8cf94ea..ab3e75a960a0 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -26,8 +26,8 @@ use datafusion_common::tree_node::{ Transformed, TransformedResult, 
TreeNode, TreeNodeRecursion, TreeNodeRewriter, }; use datafusion_common::{ - exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, HashMap, - Result, ScalarValue, + exec_err, internal_err, plan_err, Column, DFSchemaRef, DataFusionError, Diagnostic, + HashMap, Result, ScalarValue, }; use datafusion_expr::builder::get_struct_unnested_columns; use datafusion_expr::expr::{Alias, GroupingSet, Unnest, WindowFunction}; @@ -90,12 +90,35 @@ pub(crate) fn rebase_expr( .data() } +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub(crate) enum CheckColumnsSatisfyExprsPurpose { + ProjectionMustReferenceAggregate, + HavingMustReferenceAggregate, +} + +impl CheckColumnsSatisfyExprsPurpose { + fn message_prefix(&self) -> &'static str { + match self { + CheckColumnsSatisfyExprsPurpose::ProjectionMustReferenceAggregate => { + "Projection references non-aggregate values" + } + CheckColumnsSatisfyExprsPurpose::HavingMustReferenceAggregate => { + "HAVING clause references non-aggregate values" + } + } + } + + fn diagnostic_message(&self, expr: &Expr) -> String { + format!("'{expr}' must appear in GROUP BY clause because it's not an aggregate expression") + } +} + /// Determines if the set of `Expr`'s are a valid projection on the input /// `Expr::Column`'s. 
pub(crate) fn check_columns_satisfy_exprs( columns: &[Expr], exprs: &[Expr], - message_prefix: &str, + purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { columns.iter().try_for_each(|c| match c { Expr::Column(_) => Ok(()), @@ -106,22 +129,22 @@ pub(crate) fn check_columns_satisfy_exprs( match e { Expr::GroupingSet(GroupingSet::Rollup(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } Expr::GroupingSet(GroupingSet::Cube(exprs)) => { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } Expr::GroupingSet(GroupingSet::GroupingSets(lists_of_exprs)) => { for exprs in lists_of_exprs { for e in exprs { - check_column_satisfies_expr(columns, e, message_prefix)?; + check_column_satisfies_expr(columns, e, purpose)?; } } } - _ => check_column_satisfies_expr(columns, e, message_prefix)?, + _ => check_column_satisfies_expr(columns, e, purpose)?, } } Ok(()) @@ -130,15 +153,23 @@ pub(crate) fn check_columns_satisfy_exprs( fn check_column_satisfies_expr( columns: &[Expr], expr: &Expr, - message_prefix: &str, + purpose: CheckColumnsSatisfyExprsPurpose, ) -> Result<()> { if !columns.contains(expr) { return plan_err!( "{}: Expression {} could not be resolved from available columns: {}", - message_prefix, + purpose.message_prefix(), expr, expr_vec_fmt!(columns) - ); + ) + .map_err(|err| { + let diagnostic = Diagnostic::new_error( + purpose.diagnostic_message(expr), + expr.spans().and_then(|spans| spans.first()), + ) + .with_help(format!("add '{expr}' to GROUP BY clause"), None); + err.with_diagnostic(diagnostic) + }); } Ok(()) } diff --git a/datafusion/sql/tests/cases/diagnostic.rs b/datafusion/sql/tests/cases/diagnostic.rs new file mode 100644 index 000000000000..55d3a953a728 --- /dev/null +++ b/datafusion/sql/tests/cases/diagnostic.rs @@ -0,0 +1,227 @@ +// Licensed to the Apache Software Foundation 
(ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use std::collections::HashMap; + +use datafusion_common::{Diagnostic, Location, Result, Span}; +use datafusion_sql::planner::{ParserOptions, SqlToRel}; +use regex::Regex; +use sqlparser::{dialect::GenericDialect, parser::Parser}; + +use crate::{MockContextProvider, MockSessionState}; + +fn do_query(sql: &'static str) -> Diagnostic { + let dialect = GenericDialect {}; + let statement = Parser::new(&dialect) + .try_with_sql(sql) + .expect("unable to create parser") + .parse_statement() + .expect("unable to parse query"); + let options = ParserOptions { + collect_spans: true, + ..ParserOptions::default() + }; + let state = MockSessionState::default(); + let context = MockContextProvider { state }; + let sql_to_rel = SqlToRel::new_with_options(&context, options); + match sql_to_rel.sql_statement_to_plan(statement) { + Ok(_) => panic!("expected error"), + Err(err) => match err.diagnostic() { + Some(diag) => diag.clone(), + None => panic!("expected diagnostic"), + }, + } +} + +/// Given a query that contains tag delimited spans, returns a mapping from the +/// span name to the [`Span`]. Tags are comments of the form `/*tag*/`. 
In case +/// you want the same location to open two spans, or close open and open +/// another, you can use '+' to separate the names (the order matters). The +/// names of spans can only contain letters, digits, underscores, and dashes. +/// +/// +/// ## Example +/// +/// ```rust +/// let spans = get_spans("SELECT /*myspan*/car/*myspan*/ FROM cars"); +/// // myspan is ^^^ +/// dbg!(&spans["myspan"]); +/// ``` +/// +/// ## Example +/// +/// ```rust +/// let spans = get_spans("SELECT /*whole+left*/speed/*left*/ + /*right*/10/*right+whole*/ FROM cars"); +/// // whole is ^^^^^^^^^^^^^^^^^^^^^^^^^^^ +/// // left is ^^^^^ +/// // right is ^^ +/// dbg!(&spans["whole"]); +/// dbg!(&spans["left"]); +/// dbg!(&spans["right"]); +/// ``` +fn get_spans(query: &'static str) -> HashMap { + let mut spans = HashMap::new(); + + let mut bytes_per_line = vec![]; + for line in query.lines() { + bytes_per_line.push(line.len()); + } + + let byte_offset_to_loc = |s: &str, byte_offset: usize| -> Location { + let mut line = 1; + let mut column = 1; + for (i, c) in s.chars().enumerate() { + if i == byte_offset { + return Location { line, column }; + } + if c == '\n' { + line += 1; + column = 1; + } else { + column += 1; + } + } + Location { line, column } + }; + + let re = Regex::new(r#"/\*([\w\d\+_-]+)\*/"#).unwrap(); + let mut stack: Vec<(String, usize)> = vec![]; + for c in re.captures_iter(query) { + let m = c.get(0).unwrap(); + let tags = c.get(1).unwrap().as_str().split("+").collect::>(); + + for tag in tags { + if stack.last().map(|(top_tag, _)| top_tag.as_str()) == Some(tag) { + let (_, start) = stack.pop().unwrap(); + let end = m.start(); + + spans.insert( + tag.to_string(), + Span::new( + byte_offset_to_loc(query, start), + byte_offset_to_loc(query, end), + ), + ); + } else { + stack.push((tag.to_string(), m.end())); + } + } + } + + if !stack.is_empty() { + panic!("unbalanced tags"); + } + + spans +} + +#[test] +fn test_table_not_found() -> Result<()> { + let query = "SELECT * 
FROM /*a*/personx/*a*/"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "table 'personx' not found"); + assert_eq!(diag.span, Some(spans["a"])); + Ok(()) +} + +#[test] +fn test_unqualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/first_namex/*a*/ FROM person"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_namex' not found"); + assert_eq!(diag.span, Some(spans["a"])); + Ok(()) +} + +#[test] +fn test_qualified_column_not_found() -> Result<()> { + let query = "SELECT /*a*/person.first_namex/*a*/ FROM person"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_namex' not found in 'person'"); + assert_eq!(diag.span, Some(spans["a"])); + Ok(()) +} + +#[test] +fn test_union_wrong_number_of_columns() -> Result<()> { + let query = "/*whole+left*/SELECT first_name FROM person/*left*/ UNION ALL /*right*/SELECT first_name, last_name FROM person/*right+whole*/"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!( + diag.message, + "UNION queries have different number of columns" + ); + assert_eq!(diag.span, Some(spans["whole"])); + assert_eq!(diag.notes[0].message, "this side has 1 fields"); + assert_eq!(diag.notes[0].span, Some(spans["left"])); + assert_eq!(diag.notes[1].message, "this side has 2 fields"); + assert_eq!(diag.notes[1].span, Some(spans["right"])); + Ok(()) +} + +#[test] +fn test_missing_non_aggregate_in_group_by() -> Result<()> { + let query = "SELECT id, /*a*/first_name/*a*/ FROM person GROUP BY id"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!( + diag.message, + "'person.first_name' must appear in GROUP BY clause because it's not an aggregate expression" + ); + assert_eq!(diag.span, Some(spans["a"])); + assert_eq!( + diag.helps[0].message, + "add 'person.first_name' to GROUP BY clause" + ); + Ok(()) +} + +#[test] +fn 
test_ambiguous_reference() -> Result<()> { + let query = "SELECT /*a*/first_name/*a*/ FROM person a, person b"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "column 'first_name' is ambiguous"); + assert_eq!(diag.span, Some(spans["a"])); + assert_eq!( + diag.notes[0].message, + "possible reference to 'first_name' in table 'a'" + ); + assert_eq!( + diag.notes[1].message, + "possible reference to 'first_name' in table 'b'" + ); + Ok(()) +} + +#[test] +fn test_incompatible_types_binary_arithmetic() -> Result<()> { + let query = + "SELECT /*whole+left*/id/*left*/ + /*right*/first_name/*right+whole*/ FROM person"; + let spans = get_spans(query); + let diag = do_query(query); + assert_eq!(diag.message, "expressions have incompatible types"); + assert_eq!(diag.span, Some(spans["whole"])); + assert_eq!(diag.notes[0].message, "has type UInt32"); + assert_eq!(diag.notes[0].span, Some(spans["left"])); + assert_eq!(diag.notes[1].message, "has type Utf8"); + assert_eq!(diag.notes[1].span, Some(spans["right"])); + Ok(()) +} diff --git a/datafusion/sql/tests/cases/mod.rs b/datafusion/sql/tests/cases/mod.rs index fc4c59cc88d8..48574984c0ca 100644 --- a/datafusion/sql/tests/cases/mod.rs +++ b/datafusion/sql/tests/cases/mod.rs @@ -15,4 +15,5 @@ // specific language governing permissions and limitations // under the License. 
+mod diagnostic; mod plan_to_sql; diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs index f6533dceca4f..b9502c152004 100644 --- a/datafusion/sql/tests/sql_integration.rs +++ b/datafusion/sql/tests/sql_integration.rs @@ -94,6 +94,7 @@ fn parse_decimals() { enable_ident_normalization: false, support_varchar_with_length: false, enable_options_value_normalization: false, + collect_spans: false, }, ); } @@ -149,6 +150,7 @@ fn parse_ident_normalization() { enable_ident_normalization, support_varchar_with_length: false, enable_options_value_normalization: false, + collect_spans: false, }, ); if plan.is_ok() { @@ -3630,10 +3632,10 @@ fn test_prepare_statement_to_plan_panic_prepare_wrong_syntax() { // param is not number following the $ sign // panic due to error returned from the parser let sql = "PREPARE AS SELECT id, age FROM person WHERE age = $foo"; - assert_eq!( - logical_plan(sql).unwrap_err().strip_backtrace(), - "SQL error: ParserError(\"Expected: AS, found: SELECT\")" - ) + assert!(logical_plan(sql) + .unwrap_err() + .strip_backtrace() + .contains("Expected: AS, found: SELECT")) } #[test] @@ -3671,7 +3673,7 @@ fn test_non_prepare_statement_should_infer_types() { #[test] #[should_panic( - expected = "value: SQL(ParserError(\"Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: $1\"" + expected = "Expected: [NOT] NULL or TRUE|FALSE or [NOT] DISTINCT FROM after IS, found: $1" )] fn test_prepare_statement_to_plan_panic_is_param() { let sql = "PREPARE my_plan(INT) AS SELECT id, age FROM person WHERE age is $1"; @@ -4406,6 +4408,10 @@ fn plan_create_index() { } fn assert_field_not_found(err: DataFusionError, name: &str) { + let err = match err { + DataFusionError::Diagnostic(_, err) => *err, + err => err, + }; match err { DataFusionError::SchemaError { .. 
} => { let msg = format!("{err}"); diff --git a/datafusion/sqllogictest/test_files/create_external_table.slt b/datafusion/sqllogictest/test_files/create_external_table.slt index 6a63ea1cd3e4..8328f9bbd295 100644 --- a/datafusion/sqllogictest/test_files/create_external_table.slt +++ b/datafusion/sqllogictest/test_files/create_external_table.slt @@ -33,23 +33,23 @@ statement error DataFusion error: SQL error: ParserError\("Missing LOCATION clau CREATE EXTERNAL TABLE t STORED AS CSV # Option value is missing -statement error DataFusion error: SQL error: ParserError\("Expected: string or numeric value, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: string or numeric value, found: \) at Line: 1, Column: 66"\) CREATE EXTERNAL TABLE t STORED AS x OPTIONS ('k1' 'v1', k2 v2, k3) LOCATION 'blahblah' # Missing `(` in WITH ORDER clause -statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: c1"\) +statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: c1 at Line: 1, Column: 58"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER c1 LOCATION 'foo.csv' # Missing `)` in WITH ORDER clause -statement error DataFusion error: SQL error: ParserError\("Expected: \), found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: \), found: LOCATION at Line: 1, Column: 62"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH ORDER (c1 LOCATION 'foo.csv' # Missing `ROW` in WITH HEADER clause -statement error DataFusion error: SQL error: ParserError\("Expected: ROW, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: ROW, found: LOCATION at Line: 1, Column: 51"\) CREATE EXTERNAL TABLE t STORED AS CSV WITH HEADER LOCATION 'abc' # Missing `BY` in PARTITIONED clause -statement error DataFusion error: SQL error: ParserError\("Expected: BY, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: BY, found: 
LOCATION at Line: 1, Column: 51"\) CREATE EXTERNAL TABLE t STORED AS CSV PARTITIONED LOCATION 'abc' # Duplicate `STORED AS` clause @@ -69,11 +69,11 @@ statement error DataFusion error: SQL error: ParserError\("OPTIONS specified mor CREATE EXTERNAL TABLE t STORED AS CSV OPTIONS ('k1' 'v1', 'k2' 'v2') OPTIONS ('k3' 'v3') LOCATION 'foo.csv' # With typo error -statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: HEAD"\) +statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: HEAD at Line: 1, Column: 52"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH HEAD ROW LOCATION 'foo.csv'; # Missing `anything` in WITH clause -statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: LOCATION"\) +statement error DataFusion error: SQL error: ParserError\("Expected: HEADER, found: LOCATION at Line: 1, Column: 52"\) CREATE EXTERNAL TABLE t(c1 int) STORED AS CSV WITH LOCATION 'foo.csv'; # Unrecognized random clause diff --git a/datafusion/sqllogictest/test_files/distinct_on.slt b/datafusion/sqllogictest/test_files/distinct_on.slt index cc0ebf83a843..5aa7c89c15d5 100644 --- a/datafusion/sqllogictest/test_files/distinct_on.slt +++ b/datafusion/sqllogictest/test_files/distinct_on.slt @@ -153,7 +153,7 @@ b 1 29 -18218 994303988 5983957848665088916 204 9489 3275293996 1485709125918647 c 2 1 18109 2033001162 -6513304855495910254 25 43062 1491205016 5863949479783605708 0.110830784 0.929409733247 6WfVFBVGJSQb7FhA7E0lBwdvjfZnSW # can't distinct on * -query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \*"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \* at Line: 1, Column: 21"\) SELECT DISTINCT ON (*) c1 FROM aggregate_test_100 ORDER BY c1 LIMIT 3; diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index 4653df400080..5a1caad46732 100644 --- 
a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -257,6 +257,7 @@ datafusion.optimizer.repartition_sorts true datafusion.optimizer.repartition_windows true datafusion.optimizer.skip_failed_rules false datafusion.optimizer.top_down_join_key_reordering true +datafusion.sql_parser.collect_spans false datafusion.sql_parser.dialect generic datafusion.sql_parser.enable_ident_normalization true datafusion.sql_parser.enable_options_value_normalization false @@ -351,6 +352,7 @@ datafusion.optimizer.repartition_sorts true Should DataFusion execute sorts in a datafusion.optimizer.repartition_windows true Should DataFusion repartition data using the partitions keys to execute window functions in parallel using the provided `target_partitions` level datafusion.optimizer.skip_failed_rules false When set to true, the logical plan optimizer will produce warning messages if any optimization rules produce errors and then proceed to the next rule. When set to false, any rules that produce errors will cause the query to fail datafusion.optimizer.top_down_join_key_reordering true When set to true, the physical plan optimizer will run a top down process to reorder the join keys +datafusion.sql_parser.collect_spans false When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. datafusion.sql_parser.dialect generic Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. datafusion.sql_parser.enable_ident_normalization true When set to true, SQL parser will normalize ident (convert ident to lowercase when not quoted) datafusion.sql_parser.enable_options_value_normalization false When set to true, SQL parser will normalize options value (convert value to lowercase). 
Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. diff --git a/datafusion/sqllogictest/test_files/insert.slt b/datafusion/sqllogictest/test_files/insert.slt index c32b4b9ea397..097284a6b71f 100644 --- a/datafusion/sqllogictest/test_files/insert.slt +++ b/datafusion/sqllogictest/test_files/insert.slt @@ -431,7 +431,7 @@ NULL 20 500 default_text statement ok drop table test_column_defaults -statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a. +statement error DataFusion error: Schema error: No field named a\. create table test_column_defaults(a int, b int default a+1) diff --git a/datafusion/sqllogictest/test_files/insert_to_external.slt b/datafusion/sqllogictest/test_files/insert_to_external.slt index 2a2aecf2f2cc..ee4dde6248db 100644 --- a/datafusion/sqllogictest/test_files/insert_to_external.slt +++ b/datafusion/sqllogictest/test_files/insert_to_external.slt @@ -620,7 +620,7 @@ statement ok drop table test_column_defaults # test invalid default value -statement error DataFusion error: Error during planning: Column reference is not allowed in the DEFAULT expression : Schema error: No field named a. +statement error DataFusion error: Schema error: No field named a\. 
CREATE EXTERNAL TABLE test_column_defaults( a int, b int default a+1 diff --git a/datafusion/sqllogictest/test_files/predicates.slt b/datafusion/sqllogictest/test_files/predicates.slt index 878d7c8a4dfb..d03d9a36aa8e 100644 --- a/datafusion/sqllogictest/test_files/predicates.slt +++ b/datafusion/sqllogictest/test_files/predicates.slt @@ -584,7 +584,7 @@ DROP TABLE data_index_bloom_encoding_stats; # String coercion ######## -statement error DataFusion error: SQL error: ParserError\("Expected: a data type name, found: ,"\) +statement error DataFusion error: SQL error: ParserError\("Expected: a data type name, found: , at Line: 1, Column: 30"\) CREATE TABLE t(vendor_id_utf8, vendor_id_dict) AS VALUES (arrow_cast('124', 'Utf8'), arrow_cast('124', 'Dictionary(Int16, Utf8)')), diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index 98620bd6c635..744aa7d9ec92 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -339,10 +339,10 @@ NULL 1 statement error DataFusion error: SQL error: ParserError\("Expected: \(, found: EOF"\) VALUES -statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 9"\) VALUES () -statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +statement error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 13"\) VALUES (1),() statement error DataFusion error: Error during planning: Inconsistent data length across values list: got 2 values in row 1 but expected 1 diff --git a/datafusion/sqllogictest/test_files/type_coercion.slt b/datafusion/sqllogictest/test_files/type_coercion.slt index 43e7c2f7bc25..0900c88c15c0 100644 --- a/datafusion/sqllogictest/test_files/type_coercion.slt +++ 
b/datafusion/sqllogictest/test_files/type_coercion.slt @@ -103,11 +103,11 @@ CREATE TABLE orders( ); # union_different_num_columns_error() / UNION -query error DataFusion error: Error during planning: UNION queries have different number of columns: left has 1 columns whereas right has 2 columns +query error DataFusion error: Error during planning: UNION queries have different number of columns SELECT order_id FROM orders UNION SELECT customer_id, o_item_id FROM orders # union_different_num_columns_error() / UNION ALL -query error DataFusion error: Error during planning: UNION queries have different number of columns: left has 1 columns whereas right has 2 columns +query error DataFusion error: Error during planning: UNION queries have different number of columns SELECT order_id FROM orders UNION ALL SELECT customer_id, o_item_id FROM orders # union_with_different_column_names() diff --git a/datafusion/sqllogictest/test_files/unnest.slt b/datafusion/sqllogictest/test_files/unnest.slt index 362f0df11a7c..a674f7b7f422 100644 --- a/datafusion/sqllogictest/test_files/unnest.slt +++ b/datafusion/sqllogictest/test_files/unnest.slt @@ -295,7 +295,7 @@ query error DataFusion error: Error during planning: unnest\(\) requires exactly select unnest(); ## Unnest empty expression in from clause -query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \)"\) +query error DataFusion error: SQL error: ParserError\("Expected: an expression, found: \) at Line: 1, Column: 22"\) select * from unnest(); diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 02c8f127d973..999735f4c059 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -126,3 +126,4 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.sql_parser.enable_options_value_normalization | false | When set to true, SQL parser will normalize options value (convert value to lowercase). 
Note that this option is ignored and will be removed in the future. All case-insensitive values are normalized automatically. | | datafusion.sql_parser.dialect | generic | Configure the SQL dialect used by DataFusion's parser; supported values include: Generic, MySQL, PostgreSQL, Hive, SQLite, Snowflake, Redshift, MsSQL, ClickHouse, BigQuery, and Ansi. | | datafusion.sql_parser.support_varchar_with_length | true | If true, permit lengths for `VARCHAR` such as `VARCHAR(20)`, but ignore the length. If false, error if a `VARCHAR` with a length is specified. The Arrow type system does not have a notion of maximum string length and thus DataFusion can not enforce such limits. | +| datafusion.sql_parser.collect_spans | false | When set to true, the source locations relative to the original SQL query (i.e. [`Span`](sqlparser::tokenizer::Span)) will be collected and recorded in the logical plan nodes. |