-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add related source code locations to errors #13664
Changes from 5 commits
13ad9db
4603b35
86e1722
c3532c1
91f4e3c
85db8af
3f4ddd2
b4df859
e4652d7
d2e70ad
e44cb9f
31772ed
c15d8f5
09d7065
1d5c647
5fdc10e
f63a493
6f6722d
9a8ef78
fd46dce
360ac20
3dd66fc
13c4c9d
fe74dce
829430a
2772c6b
1b35207
243b788
e98694b
e847477
60a9f8b
f178db0
1308b85
51dd141
b5e326c
1cd0f3e
20e9f61
e1e6ac5
7265bd2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,85 @@ | ||
use sqlparser::tokenizer::Span; | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct Diagnostic { | ||
pub entries: Vec<DiagnosticEntry>, | ||
} | ||
|
||
#[derive(Debug, Clone)] | ||
pub struct DiagnosticEntry { | ||
pub span: Span, | ||
pub message: String, | ||
pub kind: DiagnosticEntryKind, | ||
} | ||
|
||
/// Category of a [`DiagnosticEntry`].
///
/// The enum is a plain, field-less tag, so it is `Copy`, comparable and
/// hashable (it can be used as a key when grouping or counting entries).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DiagnosticEntryKind {
    /// Entry produced by the `with_error` builder.
    Error,
    /// Entry produced by the `with_warning` builder.
    Warning,
    /// Entry produced by the `with_note` builder.
    Note,
    /// Entry produced by the `with_help` builder.
    Help,
}
|
||
impl Diagnostic { | ||
pub fn new() -> Self { | ||
Default::default() | ||
} | ||
} | ||
|
||
impl Default for Diagnostic { | ||
fn default() -> Self { | ||
Diagnostic { | ||
entries: Vec::new(), | ||
} | ||
} | ||
} | ||
|
||
impl FromIterator<DiagnosticEntry> for Diagnostic { | ||
fn from_iter<T: IntoIterator<Item = DiagnosticEntry>>(iter: T) -> Self { | ||
Diagnostic { | ||
entries: iter.into_iter().collect(), | ||
} | ||
} | ||
} | ||
|
||
macro_rules! with_kind { | ||
($name:ident, $kind:expr) => { | ||
pub fn $name(mut self, message: impl Into<String>, span: Span) -> Self { | ||
let entry = DiagnosticEntry { | ||
span, | ||
message: message.into(), | ||
kind: $kind, | ||
}; | ||
self.entries.push(entry); | ||
self | ||
} | ||
}; | ||
} | ||
|
||
impl Diagnostic { | ||
with_kind!(with_error, DiagnosticEntryKind::Error); | ||
with_kind!(with_warning, DiagnosticEntryKind::Warning); | ||
with_kind!(with_note, DiagnosticEntryKind::Note); | ||
with_kind!(with_help, DiagnosticEntryKind::Help); | ||
} | ||
|
||
impl DiagnosticEntry { | ||
pub fn new( | ||
message: impl Into<String>, | ||
kind: DiagnosticEntryKind, | ||
span: Span, | ||
) -> Self { | ||
DiagnosticEntry { | ||
span, | ||
message: message.into(), | ||
kind, | ||
} | ||
} | ||
|
||
pub fn new_without_span( | ||
message: impl Into<String>, | ||
kind: DiagnosticEntryKind, | ||
) -> Self { | ||
Self::new(message, kind, Span::empty()) | ||
} | ||
} |
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -27,7 +27,7 @@ use std::result; | |||||
use std::sync::Arc; | ||||||
|
||||||
use crate::utils::quote_identifier; | ||||||
use crate::{Column, DFSchema, TableReference}; | ||||||
use crate::{Column, DFSchema, Diagnostic, TableReference}; | ||||||
#[cfg(feature = "avro")] | ||||||
use apache_avro::Error as AvroError; | ||||||
use arrow::error::ArrowError; | ||||||
|
@@ -131,6 +131,11 @@ pub enum DataFusionError { | |||||
/// Errors from either mapping LogicalPlans to/from Substrait plans | ||||||
/// or serializing/deserializing protobytes to Substrait plans | ||||||
Substrait(String), | ||||||
/// Error wrapped together with additional contextual information intended | ||||||
/// for end users, to help them understand what went wrong by providing | ||||||
/// human-readable messages, and locations in the source query that relate | ||||||
/// to the error in some way. | ||||||
Diagnostic(Diagnostic, Box<DataFusionError>), | ||||||
} | ||||||
|
||||||
#[macro_export] | ||||||
|
@@ -328,6 +333,7 @@ impl Error for DataFusionError { | |||||
DataFusionError::External(e) => Some(e.as_ref()), | ||||||
DataFusionError::Context(_, e) => Some(e.as_ref()), | ||||||
DataFusionError::Substrait(_) => None, | ||||||
DataFusionError::Diagnostic(_, e) => Some(e.as_ref()), | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -441,6 +447,7 @@ impl DataFusionError { | |||||
DataFusionError::External(_) => "External error: ", | ||||||
DataFusionError::Context(_, _) => "", | ||||||
DataFusionError::Substrait(_) => "Substrait error: ", | ||||||
DataFusionError::Diagnostic(_, _) => "", | ||||||
} | ||||||
} | ||||||
|
||||||
|
@@ -481,8 +488,56 @@ impl DataFusionError { | |||||
Cow::Owned(format!("{desc}\ncaused by\n{}", *err)) | ||||||
} | ||||||
DataFusionError::Substrait(ref desc) => Cow::Owned(desc.to_string()), | ||||||
DataFusionError::Diagnostic(_, ref err) => Cow::Owned(err.to_string()), | ||||||
} | ||||||
} | ||||||
|
||||||
/// Wraps the error with contextual information intended for end users | ||||||
pub fn with_diagnostic(self, diagnostic: Diagnostic) -> Self { | ||||||
Self::Diagnostic(diagnostic, Box::new(self)) | ||||||
} | ||||||
|
||||||
/// Wraps the error with contextual information intended for end users. | ||||||
/// Takes a function that inspects the error and returns the diagnostic to | ||||||
/// wrap it with. | ||||||
pub fn with_diagnostic_fn<F: FnOnce(&DataFusionError) -> Diagnostic>( | ||||||
self, | ||||||
f: F, | ||||||
) -> Self { | ||||||
let diagnostic = f(&self); | ||||||
self.with_diagnostic(diagnostic) | ||||||
} | ||||||
|
||||||
pub fn get_diagnostics(&self) -> impl Iterator<Item = &Diagnostic> + '_ { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Minor -- I think a more consistent API would be to call this
Suggested change
|
||||||
struct DiagnosticsIterator<'a> { | ||||||
head: &'a DataFusionError, | ||||||
} | ||||||
|
||||||
impl<'a> Iterator for DiagnosticsIterator<'a> { | ||||||
type Item = &'a Diagnostic; | ||||||
|
||||||
fn next(&mut self) -> Option<Self::Item> { | ||||||
loop { | ||||||
if let DataFusionError::Diagnostic(diagnostics, source) = self.head { | ||||||
self.head = source.as_ref(); | ||||||
return Some(diagnostics); | ||||||
} | ||||||
|
||||||
if let Some(source) = self | ||||||
.head | ||||||
.source() | ||||||
.and_then(|source| source.downcast_ref::<DataFusionError>()) | ||||||
{ | ||||||
self.head = source; | ||||||
} else { | ||||||
return None; | ||||||
} | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
DiagnosticsIterator { head: self } | ||||||
} | ||||||
} | ||||||
|
||||||
/// Unwrap an `Option` if possible. Otherwise return an `DataFusionError::Internal`. | ||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -15,11 +15,21 @@ | |
// specific language governing permissions and limitations | ||
// under the License. | ||
|
||
use std::ops::ControlFlow; | ||
|
||
use arrow_schema::DataType; | ||
use arrow_schema::TimeUnit; | ||
use datafusion_common::Column; | ||
use datafusion_common::DataFusionError; | ||
use datafusion_common::Diagnostic; | ||
use datafusion_common::SchemaError; | ||
use datafusion_expr::planner::{ | ||
PlannerResult, RawBinaryExpr, RawDictionaryExpr, RawFieldAccessExpr, | ||
}; | ||
use datafusion_expr::utils::find_column_exprs; | ||
use sqlparser::ast::Spanned; | ||
use sqlparser::ast::Visit; | ||
use sqlparser::ast::Visitor; | ||
use sqlparser::ast::{ | ||
BinaryOperator, CastFormat, CastKind, DataType as SQLDataType, DictionaryField, | ||
Expr as SQLExpr, ExprWithAlias as SQLExprWithAlias, MapEntry, StructField, Subscript, | ||
|
@@ -36,7 +46,9 @@ use datafusion_expr::{ | |
lit, Between, BinaryExpr, Cast, Expr, ExprSchemable, GetFieldAccess, Like, Literal, | ||
Operator, TryCast, | ||
}; | ||
use sqlparser::tokenizer::Span; | ||
|
||
use crate::planner::IdentNormalizer; | ||
use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; | ||
|
||
mod binary_op; | ||
|
@@ -165,13 +177,126 @@ impl<S: ContextProvider> SqlToRel<'_, S> { | |
schema: &DFSchema, | ||
planner_context: &mut PlannerContext, | ||
) -> Result<Expr> { | ||
let mut expr = self.sql_expr_to_logical_expr(sql, schema, planner_context)?; | ||
// The location of the original SQL expression in the source code | ||
let mut expr = | ||
self.sql_expr_to_logical_expr(sql.clone(), schema, planner_context)?; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Having to copy the entire AST just to get the span information on error is not ideal (we are trying to keep planning reasonably fast). I understand that Specifically, since the #[derive(Debug, Clone)]
pub struct PlannerContext {
...
/// the current span of the expression or statement being planned
/// Note not all statements have span information yet
/// see <https://github.com/apache/datafusion-sqlparser-rs/issues/1548>
current_span: Option<Span>,
...
} Then rather than calling The key would be to manage setting/restoring the spans during the planning process. Maybe it could be something like ...
// set the `current_span` field in the planner context
// if sql has a span, otherwise use the existing span
let planner_context = planner_context.with_span(&sql);
self.sql_expr_to_logical_expr(sql.clone(), schema, planner_context)?;
... There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's a very interesting idea. I see a few issues with it though: What if What if a diagnostic needs to highlight various parts of a query. e.g. if a non-aggregated column is missing from the I thought that I could solve the above issues by putting a
The But then I encountered another issue: Some functions where For example, in the body of But, what if Then it seems like a pattern like:
might be necessary, to add a There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Also it's
eliaperantoni marked this conversation as resolved.
Show resolved
Hide resolved
|
||
expr = self.rewrite_partial_qualifier(expr, schema); | ||
self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr))?; | ||
let validation_result = | ||
self.validate_schema_satisfies_exprs(schema, std::slice::from_ref(&expr)); | ||
|
||
// Must do it here because `validate_schema_satisfies_exprs` doesn't | ||
// have access to the original SQL expression from the parser | ||
validation_result.map_err(|err| { | ||
if let DataFusionError::SchemaError( | ||
SchemaError::FieldNotFound { | ||
field, | ||
valid_fields: _, | ||
}, | ||
_, | ||
) = &err | ||
{ | ||
let diagnostic = self.get_field_not_found_diagnostic( | ||
&sql, | ||
field, | ||
schema, | ||
planner_context, | ||
); | ||
err.with_diagnostic(diagnostic) | ||
} else { | ||
err | ||
} | ||
})?; | ||
|
||
let (expr, _) = expr.infer_placeholder_types(schema)?; | ||
Ok(expr) | ||
} | ||
|
||
/// Given an unresolved field in an expression, returns a [`Diagnostic`] | ||
fn get_field_not_found_diagnostic( | ||
&self, | ||
expr: &SQLExpr, | ||
unresolved_field: &Column, | ||
schema: &DFSchema, | ||
planner_context: &mut PlannerContext, | ||
) -> Diagnostic { | ||
// Given a SQL expression like SELECT color, megabytes FROM fruit, where | ||
// we assume that the 'megabytes' column doesn't exist in table 'fruit', | ||
// we find that the logical expression Expr::Column(Column('megabytes')) | ||
// is unresolved. Though, because we don't store `sqlparser::Span` in | ||
// `datafusion::Expr`, we have no simple way to find where 'megabytes' | ||
// appears in the query. | ||
// | ||
// Instead, we have to walk down the tree of subexpressions | ||
// `sqlparser::Expr` rooted in the parameter `expr: sqlparser::Expr` to | ||
// find a node that matches the unresolved one. | ||
struct UnresolvedFieldFinder<'a, S: ContextProvider> { | ||
unresolved_field: &'a Column, | ||
sql_to_rel: &'a SqlToRel<'a, S>, | ||
schema: &'a DFSchema, | ||
planner_context: &'a mut PlannerContext, | ||
} | ||
impl<S: ContextProvider> Visitor for UnresolvedFieldFinder<'_, S> { | ||
type Break = Span; | ||
|
||
fn post_visit_expr( | ||
&mut self, | ||
sql_expr: &SQLExpr, | ||
) -> ControlFlow<Self::Break> { | ||
let (logical_expr, span) = match sql_expr { | ||
SQLExpr::Identifier(ident) => ( | ||
self.sql_to_rel | ||
.sql_identifier_to_expr( | ||
ident.clone(), | ||
self.schema, | ||
self.planner_context, | ||
) | ||
.ok(), | ||
sql_expr.span(), | ||
), | ||
SQLExpr::CompoundIdentifier(idents) => ( | ||
self.sql_to_rel | ||
.sql_compound_identifier_to_expr( | ||
idents.clone(), | ||
self.schema, | ||
self.planner_context, | ||
) | ||
.ok(), | ||
sql_expr.span(), | ||
), | ||
_ => (None, Span::empty()), | ||
}; | ||
match logical_expr { | ||
Some(Expr::Column(col)) if &col == self.unresolved_field => { | ||
ControlFlow::Break(span) | ||
} | ||
_ => ControlFlow::Continue(()), | ||
} | ||
} | ||
} | ||
let mut visitor = UnresolvedFieldFinder { | ||
unresolved_field: &unresolved_field, | ||
sql_to_rel: self, | ||
schema, | ||
planner_context, | ||
}; | ||
let span = match expr.visit(&mut visitor) { | ||
ControlFlow::Break(span) => Some(span), | ||
ControlFlow::Continue(_) => None, | ||
}; | ||
|
||
if let Some(relation) = &unresolved_field.relation { | ||
Diagnostic::new().with_error( | ||
format!("column '{}' not found in '{}'", &unresolved_field.name, relation.to_string()), | ||
span.unwrap_or(Span::empty()), | ||
) | ||
} else { | ||
Diagnostic::new().with_error( | ||
format!("column '{}' not found", &unresolved_field.name), | ||
span.unwrap_or(Span::empty()), | ||
) | ||
} | ||
} | ||
|
||
    /// Rewrite aliases which are incomplete (e.g. ones that include only the table qualifier in a schema.table qualified relation) | ||
fn rewrite_partial_qualifier(&self, expr: Expr, schema: &DFSchema) -> Expr { | ||
match expr { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This is an interesting idea -- one way to think about this is that it adds additional structured information to
DataFusionError::Context
If we went with the
DataFusion::Diagnostic
approach, do you think we would be able to deprecate / remove DataFusionError::Context
in a future release? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thank you! Yes, I think
DataFusion::Diagnostic
can convey a superset of the information that DataFusion::Context
can. Any wrapping such as: can be converted to:
And of course, we can provide a
DataFusionError::with_simple_diagnostic
function to avoid the boilerplate. At that point, DataFusion::Context
could be removed. This also enables progressively adding
Span
information to what was previously simply a string message.