diff --git a/Cargo.toml b/Cargo.toml
index 968a74e37f10..f87205f0d067 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -53,7 +53,7 @@ license = "Apache-2.0"
 readme = "README.md"
 repository = "https://github.com/apache/datafusion"
 rust-version = "1.76"
-version = "39.0.0"
+version = "40.0.0"
 
 [workspace.dependencies]
 # We turn off default-features for some dependencies here so the workspaces which inherit them can
@@ -86,23 +86,23 @@ bytes = "1.4"
 chrono = { version = "0.4.34", default-features = false }
 ctor = "0.2.0"
 dashmap = "5.5.0"
-datafusion = { path = "datafusion/core", version = "39.0.0", default-features = false }
-datafusion-common = { path = "datafusion/common", version = "39.0.0", default-features = false }
-datafusion-common-runtime = { path = "datafusion/common-runtime", version = "39.0.0" }
-datafusion-execution = { path = "datafusion/execution", version = "39.0.0" }
-datafusion-expr = { path = "datafusion/expr", version = "39.0.0" }
-datafusion-functions = { path = "datafusion/functions", version = "39.0.0" }
-datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "39.0.0" }
-datafusion-functions-array = { path = "datafusion/functions-array", version = "39.0.0" }
-datafusion-optimizer = { path = "datafusion/optimizer", version = "39.0.0", default-features = false }
-datafusion-physical-expr = { path = "datafusion/physical-expr", version = "39.0.0", default-features = false }
-datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "39.0.0", default-features = false }
-datafusion-physical-plan = { path = "datafusion/physical-plan", version = "39.0.0" }
-datafusion-proto = { path = "datafusion/proto", version = "39.0.0" }
-datafusion-proto-common = { path = "datafusion/proto-common", version = "39.0.0" }
-datafusion-sql = { path = "datafusion/sql", version = "39.0.0" }
-datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "39.0.0" }
-datafusion-substrait = { path = "datafusion/substrait", version = "39.0.0" }
+datafusion = { path = "datafusion/core", version = "40.0.0", default-features = false }
+datafusion-common = { path = "datafusion/common", version = "40.0.0", default-features = false }
+datafusion-common-runtime = { path = "datafusion/common-runtime", version = "40.0.0" }
+datafusion-execution = { path = "datafusion/execution", version = "40.0.0" }
+datafusion-expr = { path = "datafusion/expr", version = "40.0.0" }
+datafusion-functions = { path = "datafusion/functions", version = "40.0.0" }
+datafusion-functions-aggregate = { path = "datafusion/functions-aggregate", version = "40.0.0" }
+datafusion-functions-array = { path = "datafusion/functions-array", version = "40.0.0" }
+datafusion-optimizer = { path = "datafusion/optimizer", version = "40.0.0", default-features = false }
+datafusion-physical-expr = { path = "datafusion/physical-expr", version = "40.0.0", default-features = false }
+datafusion-physical-expr-common = { path = "datafusion/physical-expr-common", version = "40.0.0", default-features = false }
+datafusion-physical-plan = { path = "datafusion/physical-plan", version = "40.0.0" }
+datafusion-proto = { path = "datafusion/proto", version = "40.0.0" }
+datafusion-proto-common = { path = "datafusion/proto-common", version = "40.0.0" }
+datafusion-sql = { path = "datafusion/sql", version = "40.0.0" }
+datafusion-sqllogictest = { path = "datafusion/sqllogictest", version = "40.0.0" }
+datafusion-substrait = { path = "datafusion/substrait", version = "40.0.0" }
 doc-comment = "0.3"
env_logger = "0.11" futures = "0.3" diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 500e731a5b4f..42ec5922a73f 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -381,13 +381,13 @@ dependencies = [ [[package]] name = "async-trait" -version = "0.1.80" +version = "0.1.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c6fa2087f2753a7da8cc1c0dbfcf89579dd57458e36769de5ac750b4671737ca" +checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -875,9 +875,9 @@ dependencies = [ [[package]] name = "cc" -version = "1.0.104" +version = "1.0.106" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "74b6a57f98764a267ff415d50a25e6e166f3831a5071af4995296ea97d210490" +checksum = "066fce287b1d4eafef758e89e09d724a24808a9196fe9756b8ca90e86d0719a2" dependencies = [ "jobserver", "libc", @@ -1099,7 +1099,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "edb49164822f3ee45b17acd4a208cfc1251410cf0cad9a833234c9890774dd9f" dependencies = [ "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -1123,7 +1123,7 @@ dependencies = [ [[package]] name = "datafusion" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "apache-avro", @@ -1177,7 +1177,7 @@ dependencies = [ [[package]] name = "datafusion-cli" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "assert_cmd", @@ -1204,7 +1204,7 @@ dependencies = [ [[package]] name = "datafusion-common" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "apache-avro", @@ -1225,14 +1225,14 @@ dependencies = [ [[package]] name = "datafusion-common-runtime" -version = "39.0.0" +version = "40.0.0" dependencies = [ "tokio", ] [[package]] name = "datafusion-execution" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "chrono", @@ -1251,7 +1251,7 @@ dependencies = [ [[package]] name = "datafusion-expr" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "arrow", @@ -1268,7 +1268,7 @@ dependencies = [ [[package]] name = "datafusion-functions" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "base64 0.22.1", @@ -1292,7 +1292,7 @@ dependencies = [ [[package]] name = "datafusion-functions-aggregate" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "arrow", @@ -1308,7 +1308,7 @@ dependencies = [ [[package]] name = "datafusion-functions-array" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "arrow-array", @@ -1327,7 +1327,7 @@ dependencies = [ [[package]] name = "datafusion-optimizer" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "async-trait", @@ -1345,7 +1345,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "arrow", @@ -1373,7 +1373,7 @@ dependencies = [ [[package]] name = "datafusion-physical-expr-common" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "arrow", @@ -1385,7 +1385,7 @@ dependencies = [ [[package]] name = "datafusion-physical-plan" -version = "39.0.0" +version = "40.0.0" dependencies = [ "ahash", "arrow", @@ -1417,7 +1417,7 @@ dependencies = [ [[package]] name = "datafusion-sql" -version = "39.0.0" +version = "40.0.0" dependencies = [ "arrow", "arrow-array", @@ -1686,7 +1686,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ 
"proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -2008,7 +2008,7 @@ dependencies = [ "http 1.1.0", "hyper 1.4.0", "hyper-util", - "rustls 0.23.10", + "rustls 0.23.11", "rustls-native-certs 0.7.1", "rustls-pki-types", "tokio", @@ -2699,7 +2699,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -2822,7 +2822,7 @@ dependencies = [ "quinn-proto", "quinn-udp", "rustc-hash", - "rustls 0.23.10", + "rustls 0.23.11", "thiserror", "tokio", "tracing", @@ -2838,7 +2838,7 @@ dependencies = [ "rand", "ring 0.17.8", "rustc-hash", - "rustls 0.23.10", + "rustls 0.23.11", "slab", "thiserror", "tinyvec", @@ -2987,7 +2987,7 @@ dependencies = [ "percent-encoding", "pin-project-lite", "quinn", - "rustls 0.23.10", + "rustls 0.23.11", "rustls-native-certs 0.7.1", "rustls-pemfile 2.1.2", "rustls-pki-types", @@ -3117,9 +3117,9 @@ dependencies = [ [[package]] name = "rustls" -version = "0.23.10" +version = "0.23.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05cff451f60db80f490f3c182b77c35260baace73209e9cdbbe526bfe3a4d402" +checksum = "4828ea528154ae444e5a642dbb7d5623354030dc9822b83fd9bb79683c7399d0" dependencies = [ "once_cell", "ring 0.17.8", @@ -3296,22 +3296,22 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" [[package]] name = "serde" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7253ab4de971e72fb7be983802300c30b5a7f0c2e56fab8abfc6a214307c0094" +checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.203" +version = "1.0.204" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "500cbc0ebeb6f46627f50f3f5811ccf6bf00643be300b4c3eabc0ef55dc5b5ba" +checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3446,7 +3446,7 @@ checksum = "01b2e185515564f15375f593fb966b5718bc624ba77fe49fa4616ad619690554" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3492,7 +3492,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3505,7 +3505,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3527,9 +3527,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.68" +version = "2.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "901fa70d88b9d6c98022e23b4136f9f3e54e4662c3bc1bd1d84a42a9a0f0c1e9" +checksum = "2f0209b68b3613b093e0ec905354eccaedcfe83b8cb37cbdeae64026c3064c16" dependencies = [ "proc-macro2", "quote", @@ -3592,7 +3592,7 @@ checksum = "46c3384250002a6d5af4d114f2845d37b57521033f30d5c3f46c4d70e1197533" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3647,9 +3647,9 @@ dependencies = [ [[package]] name = "tinyvec" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ce6b6a2fb3a985e99cebfaefa9faa3024743da73304ca1c683a36429613d3d22" +checksum = "445e881f4f6d382d5f27c034e25eb92edd7c784ceab92a0937db7f2e9471b938" dependencies = [ "tinyvec_macros", ] @@ -3687,7 +3687,7 @@ checksum = 
"5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3707,7 +3707,7 @@ version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0c7bc40d0e5a97695bb96e27995cd3a08538541b0a846f65bba7a359f36700d4" dependencies = [ - "rustls 0.23.10", + "rustls 0.23.11", "rustls-pki-types", "tokio", ] @@ -3784,7 +3784,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3829,7 +3829,7 @@ checksum = "f03ca4cb38206e2bef0700092660bb74d696f808514dae47fa1467cbfe26e96e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] @@ -3983,7 +3983,7 @@ dependencies = [ "once_cell", "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", "wasm-bindgen-shared", ] @@ -4017,7 +4017,7 @@ checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", "wasm-bindgen-backend", "wasm-bindgen-shared", ] @@ -4282,7 +4282,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.68", + "syn 2.0.70", ] [[package]] diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index bcacf1d52a9b..860dc123fa94 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -18,7 +18,7 @@ [package] name = "datafusion-cli" description = "Command Line Client for DataFusion query engine." -version = "39.0.0" +version = "40.0.0" authors = ["Apache DataFusion "] edition = "2021" keywords = ["arrow", "datafusion", "query", "sql"] @@ -35,7 +35,7 @@ async-trait = "0.1.41" aws-config = "0.55" aws-credential-types = "0.55" clap = { version = "3", features = ["derive", "cargo"] } -datafusion = { path = "../datafusion/core", version = "39.0.0", features = [ +datafusion = { path = "../datafusion/core", version = "40.0.0", features = [ "avro", "crypto_expressions", "datetime_expressions", diff --git a/datafusion-examples/examples/advanced_parquet_index.rs b/datafusion-examples/examples/advanced_parquet_index.rs index 9bf71e52c3de..c60649ca7c9d 100644 --- a/datafusion-examples/examples/advanced_parquet_index.rs +++ b/datafusion-examples/examples/advanced_parquet_index.rs @@ -42,6 +42,7 @@ use datafusion::physical_optimizer::pruning::PruningPredicate; use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; use datafusion::physical_plan::ExecutionPlan; use datafusion::prelude::*; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::{ internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue, }; @@ -299,8 +300,8 @@ impl IndexTableProvider { // In this example, we use the PruningPredicate's literal guarantees to // analyze the predicate. In a real system, using // `PruningPredicate::prune` would likely be easier to do. - let pruning_predicate = - PruningPredicate::try_new(Arc::clone(predicate), self.schema().clone())?; + let schema = SchemaRef::new(self.schema().as_ref().clone().into()); + let pruning_predicate = PruningPredicate::try_new(Arc::clone(predicate), schema)?; // The PruningPredicate's guarantees must all be satisfied in order for // the predicate to possibly evaluate to true. 
@@ -453,8 +454,8 @@ impl TableProvider for IndexTableProvider {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.indexed_file.schema)
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.indexed_file.schema.as_ref().clone().into())
     }
 
     fn table_type(&self) -> TableType {
@@ -482,7 +483,7 @@ impl TableProvider for IndexTableProvider {
             .with_extensions(Arc::new(access_plan) as _);
 
         // Prepare for scanning
-        let schema = self.schema();
+        let schema = SchemaRef::new(self.schema().as_ref().clone().into());
         let object_store_url = ObjectStoreUrl::parse("file://")?;
         let file_scan_config = FileScanConfig::new(object_store_url, schema)
             .with_limit(limit)
diff --git a/datafusion-examples/examples/custom_datasource.rs b/datafusion-examples/examples/custom_datasource.rs
index cfb49b023159..be3dd70de3d1 100644
--- a/datafusion-examples/examples/custom_datasource.rs
+++ b/datafusion-examples/examples/custom_datasource.rs
@@ -22,7 +22,7 @@ use std::sync::{Arc, Mutex};
 use std::time::Duration;
 
 use datafusion::arrow::array::{UInt64Builder, UInt8Builder};
-use datafusion::arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use datafusion::arrow::datatypes::{DataType, SchemaRef};
 use datafusion::arrow::record_batch::RecordBatch;
 use datafusion::datasource::{provider_as_source, TableProvider, TableType};
 use datafusion::error::Result;
@@ -37,6 +37,8 @@ use datafusion_expr::LogicalPlanBuilder;
 use datafusion_physical_expr::EquivalenceProperties;
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::field::LogicalPhysicalField;
+use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef};
 use tokio::time::timeout;
 
 /// This example demonstrates executing a simple query against a custom datasource
@@ -162,10 +164,10 @@ impl TableProvider for CustomDataSource {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        SchemaRef::new(Schema::new(vec![
-            Field::new("id", DataType::UInt8, false),
-            Field::new("bank_account", DataType::UInt64, true),
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(LogicalPhysicalSchema::new(vec![
+            LogicalPhysicalField::new("id", DataType::UInt8, false),
+            LogicalPhysicalField::new("bank_account", DataType::UInt64, true),
         ]))
     }
 
@@ -181,7 +183,8 @@ impl TableProvider for CustomDataSource {
         _filters: &[Expr],
         _limit: Option<usize>,
     ) -> Result<Arc<dyn ExecutionPlan>> {
-        return self.create_physical_plan(projection, self.schema()).await;
+        let schema = SchemaRef::new(self.schema().as_ref().clone().into());
+        return self.create_physical_plan(projection, schema).await;
     }
 }
 
diff --git a/datafusion-examples/examples/dataframe_subquery.rs b/datafusion-examples/examples/dataframe_subquery.rs
index e798751b3353..7a7f3083742c 100644
--- a/datafusion-examples/examples/dataframe_subquery.rs
+++ b/datafusion-examples/examples/dataframe_subquery.rs
@@ -15,9 +15,9 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow_schema::DataType;
 use std::sync::Arc;
 
+use arrow_schema::DataType;
 use datafusion::error::Result;
 use datafusion::functions_aggregate::average::avg;
 use datafusion::prelude::*;
diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs
index 43729a913e5d..16ec02515b07 100644
--- a/datafusion-examples/examples/expr_api.rs
+++ b/datafusion-examples/examples/expr_api.rs
@@ -28,6 +28,8 @@ use datafusion::functions_aggregate::first_last::first_value_udaf;
 use datafusion::optimizer::simplify_expressions::ExprSimplifier;
 use datafusion::physical_expr::{analyze, AnalysisContext, ExprBoundaries};
 use datafusion::prelude::*;
+use datafusion_common::logical_type::field::LogicalPhysicalField;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchema;
 use datafusion_common::{ScalarValue, ToDFSchema};
 use datafusion_expr::execution_props::ExecutionProps;
 use datafusion_expr::expr::BinaryExpr;
@@ -156,7 +158,8 @@ fn simplify_demo() -> Result<()> {
     // However, DataFusion's simplification logic can do this for you
 
     // you need to tell DataFusion the type of column "ts":
-    let schema = Schema::new(vec![make_ts_field("ts")]).to_dfschema_ref()?;
+    let schema =
+        LogicalPhysicalSchema::from(Schema::new(vec![make_ts_field("ts")])).to_dfschema_ref()?;
 
     // And then build a simplifier
     // the ExecutionProps carries information needed to simplify
@@ -177,10 +180,10 @@ fn simplify_demo() -> Result<()> {
     );
 
     // here are some other examples of what DataFusion is capable of
-    let schema = Schema::new(vec![
+    let schema = LogicalPhysicalSchema::from(Schema::new(vec![
         make_field("i", DataType::Int64),
         make_field("b", DataType::Boolean),
-    ])
+    ]))
     .to_dfschema_ref()?;
     let context = SimplifyContext::new(&props).with_schema(schema.clone());
     let simplifier = ExprSimplifier::new(context);
@@ -211,7 +214,8 @@ fn simplify_demo() -> Result<()> {
     // String --> Date simplification
     // `cast('2020-09-01' as date)` --> 18500
     assert_eq!(
-        simplifier.simplify(lit("2020-09-01").cast_to(&DataType::Date32, &schema)?)?,
+        simplifier
+            .simplify(lit("2020-09-01").cast_to(&DataType::Date32.into(), &schema)?)?,
         lit(ScalarValue::Date32(Some(18506)))
     );
 
@@ -258,7 +262,7 @@ fn range_analysis_demo() -> Result<()> {
     let analysis_result = analyze(
         &physical_expr,
         AnalysisContext::new(boundaries),
-        df_schema.as_ref(),
+        &df_schema.into(),
     )?;
 
     // The results of the analysis is an range, encoded as an `Interval`, for
@@ -293,14 +297,14 @@ fn expression_type_demo() -> Result<()> {
     // a schema. In this case we create a schema where the column `c` is of
     // type Utf8 (a String / VARCHAR)
     let schema = DFSchema::from_unqualified_fields(
-        vec![Field::new("c", DataType::Utf8, true)].into(),
+        vec![LogicalPhysicalField::new("c", DataType::Utf8, true)].into(),
         HashMap::new(),
     )?;
     assert_eq!("Utf8", format!("{}", expr.get_type(&schema).unwrap()));
 
     // Using a schema where the column `foo` is of type Int32
     let schema = DFSchema::from_unqualified_fields(
-        vec![Field::new("c", DataType::Int32, true)].into(),
+        vec![LogicalPhysicalField::new("c", DataType::Int32, true)].into(),
         HashMap::new(),
     )?;
     assert_eq!("Int32", format!("{}", expr.get_type(&schema).unwrap()));
@@ -310,8 +314,8 @@ fn expression_type_demo() -> Result<()> {
     let expr = col("c1") + col("c2");
     let schema = DFSchema::from_unqualified_fields(
         vec![
-            Field::new("c1", DataType::Int32, true),
-            Field::new("c2", DataType::Float32, true),
+            LogicalPhysicalField::new("c1", DataType::Int32, true),
+            LogicalPhysicalField::new("c2", DataType::Float32, true),
         ]
         .into(),
         HashMap::new(),
diff --git a/datafusion-examples/examples/function_factory.rs b/datafusion-examples/examples/function_factory.rs
index f57b3bf60404..e187cfab3859 100644
--- a/datafusion-examples/examples/function_factory.rs
+++ b/datafusion-examples/examples/function_factory.rs
@@ -22,6 +22,7 @@ use datafusion::error::Result;
 use datafusion::execution::context::{
     FunctionFactory, RegisterFunction, SessionContext, SessionState,
 };
+use datafusion_common::logical_type::TypeRelation;
 use datafusion_common::tree_node::{Transformed, TreeNode};
 use datafusion_common::{exec_err, internal_err, DataFusionError};
 use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo};
@@ -216,13 +217,15 @@ impl TryFrom<CreateFunction> for ScalarFunctionWrapper {
                 .expect("Expression has to be defined!"),
             return_type: definition
                 .return_type
-                .expect("Return type has to be defined!"),
+                .expect("Return type has to be defined!")
+                .physical()
+                .clone(),
             signature: Signature::exact(
                 definition
                     .args
                     .unwrap_or_default()
                     .into_iter()
-                    .map(|a| a.data_type)
+                    .map(|a| a.data_type.physical().clone())
                    .collect(),
                 definition
                     .params
diff --git a/datafusion-examples/examples/logical_type.rs b/datafusion-examples/examples/logical_type.rs
new file mode 100644
index 000000000000..19f6e445f9b5
--- /dev/null
+++ b/datafusion-examples/examples/logical_type.rs
@@ -0,0 +1,105 @@
+use arrow::util::pretty::pretty_format_batches;
+use arrow_schema::{DataType, Field, TimeUnit};
+use datafusion::datasource::TableProvider;
+use datafusion::error::Result;
+use datafusion::execution::context::SessionState;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion::prelude::SessionContext;
+use datafusion_common::logical_type::field::LogicalPhysicalField;
+use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef};
+use datafusion_common::logical_type::signature::LogicalType;
+use datafusion_common::logical_type::{TypeRelation, TypeRelationRef};
+use datafusion_expr::{Expr, TableType};
+use std::any::Any;
+use std::sync::Arc;
+
+#[tokio::main]
+async fn main() -> Result<()> {
+    let ctx = SessionContext::new();
+    ctx.register_table("example", Arc::new(ExampleTableSource::default()))?;
+
+    let df = ctx.sql("SELECT * FROM example").await?;
+    let records = df.collect().await?;
+
+    println!("{}", pretty_format_batches(&records)?);
+
+    Ok(())
+}
+
+#[derive(Debug)]
+struct CustomMagicalType {
+    logical: LogicalType,
+    physical: DataType,
+}
+
+impl Default for CustomMagicalType {
+    fn default() -> Self {
+        Self {
+            logical: LogicalType::Utf8,
+            physical: DataType::new_list(DataType::UInt8, false),
+        }
+    }
+}
+
+impl TypeRelation for CustomMagicalType {
+    fn logical(&self) -> &LogicalType {
+        &self.logical
+    }
+
+    fn physical(&self) -> &DataType {
+        &self.physical
+    }
+
+    // TODO: materialisation methods?
+}
+
+#[derive(Default)]
+struct ExampleTableSource {}
+
+#[async_trait::async_trait]
+impl TableProvider for ExampleTableSource {
+    fn as_any(&self) -> &dyn Any {
+        self
+    }
+
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        // TODO: ugly?
+        let custom_magical_type: TypeRelationRef =
+            Arc::new(CustomMagicalType::default());
+
+        // This schema will be equivalent to:
+        // a -> Timestamp(Microsecond, None)
+        // b -> Utf8
+        // c -> Int64
+        Arc::new(LogicalPhysicalSchema::new(vec![
+            LogicalPhysicalField::new(
+                "a",
+                DataType::RunEndEncoded(
+                    Arc::new(Field::new("run_ends", DataType::Int64, false)),
+                    Arc::new(Field::new(
+                        "values",
+                        DataType::Timestamp(TimeUnit::Microsecond, None),
+                        false,
+                    )),
+                ),
+                false,
+            ),
+            LogicalPhysicalField::new("b", custom_magical_type, false),
+            LogicalPhysicalField::new("c", DataType::Int64, true),
+        ]))
+    }
+
+    fn table_type(&self) -> TableType {
+        TableType::Base
+    }
+
+    async fn scan(
+        &self,
+        _state: &SessionState,
+        _projection: Option<&Vec<usize>>,
+        _filters: &[Expr],
+        _limit: Option<usize>,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        todo!()
+    }
+}
diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs
index 668eda047444..884392972bae 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/parquet_index.rs
@@ -37,6 +37,7 @@ use datafusion::parquet::arrow::{
 use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::*;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 use datafusion_common::{
     internal_datafusion_err, DFSchema, DataFusionError, Result, ScalarValue,
 };
@@ -212,8 +213,8 @@ impl TableProvider for IndexTableProvider {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        self.index.schema().clone()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.index.schema().into())
     }
 
     fn table_type(&self) -> TableType {
@@ -243,7 +244,8 @@ impl TableProvider for IndexTableProvider {
         let files = self.index.get_files(predicate.clone())?;
 
         let object_store_url = ObjectStoreUrl::parse("file://")?;
-        let mut file_scan_config = FileScanConfig::new(object_store_url, self.schema())
+        let schema = SchemaRef::new(self.schema().as_ref().clone().into());
+        let mut file_scan_config = FileScanConfig::new(object_store_url, schema)
             .with_projection(projection.cloned())
             .with_limit(limit);
diff --git a/datafusion-examples/examples/simple_udtf.rs b/datafusion-examples/examples/simple_udtf.rs
index c68c21fab169..75bcce48eb85 100644
--- a/datafusion-examples/examples/simple_udtf.rs
+++ b/datafusion-examples/examples/simple_udtf.rs
@@ -27,6 +27,7 @@ use datafusion::execution::context::{ExecutionProps, SessionState};
 use datafusion::physical_plan::memory::MemoryExec;
 use datafusion::physical_plan::ExecutionPlan;
 use datafusion::prelude::SessionContext;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 use datafusion_common::{plan_err, ScalarValue};
 use datafusion_expr::simplify::SimplifyContext;
 use datafusion_expr::{Expr, TableType};
@@ -35,7 +36,6 @@ use std::fs::File;
 use std::io::Seek;
 use std::path::Path;
 use std::sync::Arc;
-
 // To define your own table function, you only need to do the following 3 things:
 // 1. Implement your own [`TableProvider`]
 // 2. Implement your own [`TableFunctionImpl`] and return your [`TableProvider`]
@@ -85,8 +85,8 @@ impl TableProvider for LocalCsvTable {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.schema.clone().into())
     }
 
     fn table_type(&self) -> TableType {
@@ -121,7 +121,7 @@ impl TableProvider for LocalCsvTable {
         };
         Ok(Arc::new(MemoryExec::try_new(
             &[batches],
-            TableProvider::schema(self),
+            self.schema.clone(),
             projection.cloned(),
         )?))
     }
diff --git a/datafusion-examples/examples/sql_frontend.rs b/datafusion-examples/examples/sql_frontend.rs
index 839ee95eb181..b32261774f86 100644
--- a/datafusion-examples/examples/sql_frontend.rs
+++ b/datafusion-examples/examples/sql_frontend.rs
@@ -15,8 +15,11 @@
 // specific language governing permissions and limitations
 // under the License.
 
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
+use arrow::datatypes::DataType;
 use datafusion_common::config::ConfigOptions;
+use datafusion_common::logical_type::field::LogicalPhysicalField;
+use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef};
+use datafusion_common::logical_type::LogicalPhysicalType;
 use datafusion_common::{plan_err, Result};
 use datafusion_expr::{
     AggregateUDF, Expr, LogicalPlan, ScalarUDF, TableProviderFilterPushDown, TableSource,
@@ -139,9 +142,9 @@ impl ContextProvider for MyContextProvider {
     fn get_table_source(&self, name: TableReference) -> Result<Arc<dyn TableSource>> {
         if name.table() == "person" {
             Ok(Arc::new(MyTableSource {
-                schema: Arc::new(Schema::new(vec![
-                    Field::new("name", DataType::Utf8, false),
-                    Field::new("age", DataType::UInt8, false),
+                schema: Arc::new(LogicalPhysicalSchema::new(vec![
+                    LogicalPhysicalField::new("name", DataType::Utf8, false),
+                    LogicalPhysicalField::new("age", DataType::UInt8, false),
                 ])),
             }))
         } else {
@@ -157,7 +160,7 @@ impl ContextProvider for MyContextProvider {
         None
     }
 
-    fn get_variable_type(&self, _variable_names: &[String]) -> Option<DataType> {
+    fn get_variable_type(&self, _variable_names: &[String]) -> Option<LogicalPhysicalType> {
         None
     }
 
@@ -184,7 +187,7 @@ impl ContextProvider for MyContextProvider {
 
 /// TableSource is the part of TableProvider needed for creating a LogicalPlan.
 struct MyTableSource {
-    schema: SchemaRef,
+    schema: LogicalPhysicalSchemaRef,
 }
 
 impl TableSource for MyTableSource {
@@ -192,7 +195,7 @@ impl TableSource for MyTableSource {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
         self.schema.clone()
     }
 
diff --git a/datafusion/common/src/column.rs b/datafusion/common/src/column.rs
index e36a4f890644..24b24db887fd 100644
--- a/datafusion/common/src/column.rs
+++ b/datafusion/common/src/column.rs
@@ -17,9 +17,8 @@
 
 //! Column
 
-use arrow_schema::{Field, FieldRef};
-
 use crate::error::_schema_err;
+use crate::logical_type::field::{LogicalPhysicalField, LogicalPhysicalFieldRef};
 use crate::utils::{parse_identifiers_normalized, quote_identifier};
 use crate::{DFSchema, DataFusionError, Result, SchemaError, TableReference};
 use std::collections::HashSet;
@@ -349,15 +348,15 @@ impl From<String> for Column {
 }
 
 /// Create a column, use qualifier and field name
-impl From<(Option<&TableReference>, &Field)> for Column {
-    fn from((relation, field): (Option<&TableReference>, &Field)) -> Self {
+impl From<(Option<&TableReference>, &LogicalPhysicalField)> for Column {
+    fn from((relation, field): (Option<&TableReference>, &LogicalPhysicalField)) -> Self {
         Self::new(relation.cloned(), field.name())
     }
 }
 
 /// Create a column, use qualifier and field name
-impl From<(Option<&TableReference>, &FieldRef)> for Column {
-    fn from((relation, field): (Option<&TableReference>, &FieldRef)) -> Self {
+impl From<(Option<&TableReference>, &LogicalPhysicalFieldRef)> for Column {
+    fn from((relation, field): (Option<&TableReference>, &LogicalPhysicalFieldRef)) -> Self {
         Self::new(relation.cloned(), field.name())
     }
 }
@@ -380,7 +379,7 @@ impl fmt::Display for Column {
 mod tests {
     use super::*;
     use arrow::datatypes::DataType;
-    use arrow_schema::SchemaBuilder;
+    use arrow_schema::{Field, SchemaBuilder};
 
     fn create_qualified_schema(qualifier: &str, names: Vec<&str>) -> Result<DFSchema> {
         let mut schema_builder = SchemaBuilder::new();
@@ -389,7 +388,7 @@ mod tests {
                 .iter()
                 .map(|f| Field::new(*f, DataType::Boolean, true)),
         );
-        let schema = Arc::new(schema_builder.finish());
+        let schema = Arc::new(schema_builder.finish().into());
         DFSchema::try_from_qualified_schema(qualifier, &schema)
     }
 
diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs
index 3c2cc89fc014..9f82bc3dee57 100644
--- a/datafusion/common/src/dfschema.rs
+++ b/datafusion/common/src/dfschema.rs
@@ -29,9 +29,14 @@ use crate::{
     SchemaError, TableReference,
 };
 
+use crate::logical_type::field::{LogicalPhysicalField, LogicalPhysicalFieldRef};
+use crate::logical_type::fields::LogicalPhysicalFields;
+use crate::logical_type::schema::{
+    LogicalPhysicalSchema, LogicalPhysicalSchemaBuilder, LogicalPhysicalSchemaRef,
+};
+use crate::logical_type::{TypeRelation, LogicalPhysicalType};
 use arrow::compute::can_cast_types;
-use arrow::datatypes::{DataType, Field, FieldRef, Fields, Schema, SchemaRef};
-use arrow_schema::SchemaBuilder;
+use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
 
 /// A reference-counted reference to a [DFSchema].
 pub type DFSchemaRef = Arc<DFSchema>;
@@ -62,7 +67,7 @@ pub type DFSchemaRef = Arc<DFSchema>;
 ///     Field::new("c1", DataType::Int32, false),
 /// ]);
 ///
-/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema).unwrap();
+/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema.into()).unwrap();
 /// let column = Column::from_qualified_name("t1.c1");
 /// assert!(df_schema.has_column(&column));
 ///
@@ -107,9 +112,9 @@ pub type DFSchemaRef = Arc<DFSchema>;
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct DFSchema {
     /// Inner Arrow schema reference.
-    inner: SchemaRef,
+    inner: LogicalPhysicalSchemaRef,
     /// Optional qualifiers for each column in this schema. In the same order as
-    /// the `self.inner.fields()`
+    /// the `self.inner.fields`
     field_qualifiers: Vec<Option<TableReference>>,
     /// Stores functional dependencies in the schema.
     functional_dependencies: FunctionalDependencies,
@@ -119,7 +124,7 @@ impl DFSchema {
     /// Creates an empty `DFSchema`
     pub fn empty() -> Self {
         Self {
-            inner: Arc::new(Schema::new([])),
+            inner: Arc::new(LogicalPhysicalSchema::new([])),
             field_qualifiers: vec![],
             functional_dependencies: FunctionalDependencies::empty(),
         }
@@ -128,26 +133,26 @@ impl DFSchema {
     /// Return a reference to the inner Arrow [`Schema`]
     ///
     /// Note this does not have the qualifier information
-    pub fn as_arrow(&self) -> &Schema {
+    pub fn as_arrow(&self) -> &LogicalPhysicalSchema {
         self.inner.as_ref()
     }
 
     /// Return a reference to the inner Arrow [`SchemaRef`]
     ///
     /// Note this does not have the qualifier information
-    pub fn inner(&self) -> &SchemaRef {
+    pub fn inner(&self) -> &LogicalPhysicalSchemaRef {
         &self.inner
     }
 
     /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier
     pub fn new_with_metadata(
-        qualified_fields: Vec<(Option<TableReference>, Arc<Field>)>,
+        qualified_fields: Vec<(Option<TableReference>, Arc<LogicalPhysicalField>)>,
         metadata: HashMap<String, String>,
     ) -> Result<Self> {
-        let (qualifiers, fields): (Vec<Option<TableReference>>, Vec<Arc<Field>>) =
+        let (qualifiers, fields): (Vec<Option<TableReference>>, Vec<Arc<LogicalPhysicalField>>) =
             qualified_fields.into_iter().unzip();
 
-        let schema = Arc::new(Schema::new_with_metadata(fields, metadata));
+        let schema = Arc::new(LogicalPhysicalSchema::new_with_metadata(fields, metadata));
 
         let dfschema = Self {
             inner: schema,
@@ -161,7 +166,7 @@ impl DFSchema {
     /// Create a new `DFSchema` from a list of Arrow [Field]s
     #[allow(deprecated)]
     pub fn from_unqualified_fields(
-        fields: Fields,
+        fields: LogicalPhysicalFields,
         metadata: HashMap<String, String>,
     ) -> Result<Self> {
         Self::from_unqualifed_fields(fields, metadata)
@@ -173,11 +178,11 @@ impl DFSchema {
         note = "Please use `from_unqualified_fields` instead (this one's name is a typo). This method is subject to be removed soon"
     )]
     pub fn from_unqualifed_fields(
-        fields: Fields,
+        fields: LogicalPhysicalFields,
         metadata: HashMap<String, String>,
     ) -> Result<Self> {
         let field_count = fields.len();
-        let schema = Arc::new(Schema::new_with_metadata(fields, metadata));
+        let schema = Arc::new(LogicalPhysicalSchema::new_with_metadata(fields, metadata));
         let dfschema = Self {
             inner: schema,
             field_qualifiers: vec![None; field_count],
@@ -193,7 +198,7 @@ impl DFSchema {
     /// `DFSchema::try_from`.
     pub fn try_from_qualified_schema(
         qualifier: impl Into<TableReference>,
-        schema: &Schema,
+        schema: &LogicalPhysicalSchema,
     ) -> Result<Self> {
         let qualifier = qualifier.into();
         let schema = DFSchema {
@@ -208,7 +213,7 @@ impl DFSchema {
     /// Create a `DFSchema` from an Arrow schema where all the fields have a given qualifier
     pub fn from_field_specific_qualified_schema(
         qualifiers: Vec<Option<TableReference>>,
-        schema: &SchemaRef,
+        schema: &LogicalPhysicalSchemaRef,
     ) -> Result<Self> {
         let dfschema = Self {
             inner: schema.clone(),
@@ -224,7 +229,7 @@ impl DFSchema {
         let mut qualified_names = BTreeSet::new();
         let mut unqualified_names = BTreeSet::new();
 
-        for (field, qualifier) in self.inner.fields().iter().zip(&self.field_qualifiers) {
+        for (field, qualifier) in self.inner.fields.iter().zip(&self.field_qualifiers) {
             if let Some(qualifier) = qualifier {
                 qualified_names.insert((qualifier, field.name()));
             } else if !unqualified_names.insert(field.name()) {
@@ -263,8 +268,8 @@ impl DFSchema {
     /// Create a new schema that contains the fields from this schema followed by the fields
     /// from the supplied schema. An error will be returned if there are duplicate field names.
     pub fn join(&self, schema: &DFSchema) -> Result<Self> {
-        let mut schema_builder = SchemaBuilder::new();
-        schema_builder.extend(self.inner.fields().iter().cloned());
+        let mut schema_builder = LogicalPhysicalSchemaBuilder::new();
+        schema_builder.extend(self.inner.fields.iter().cloned());
         schema_builder.extend(schema.fields().iter().cloned());
         let new_schema = schema_builder.finish();
 
@@ -291,23 +296,19 @@ impl DFSchema {
             return;
         }
 
-        let self_fields: HashSet<(Option<&TableReference>, &FieldRef)> =
+        let self_fields: HashSet<(Option<&TableReference>, &LogicalPhysicalFieldRef)> =
             self.iter().collect();
-        let self_unqualified_names: HashSet<&str> = self
-            .inner
-            .fields
-            .iter()
-            .map(|field| field.name().as_str())
-            .collect();
+        let self_unqualified_names: HashSet<&str> =
+            self.inner.fields.iter().map(|field| field.name()).collect();
 
-        let mut schema_builder = SchemaBuilder::from(self.inner.fields.clone());
+        let mut schema_builder = LogicalPhysicalSchemaBuilder::from(self.inner.fields.clone());
         let mut qualifiers = Vec::new();
         for (qualifier, field) in other_schema.iter() {
             // skip duplicate columns
             let duplicated_field = match qualifier {
                 Some(q) => self_fields.contains(&(Some(q), field)),
                 // for unqualified columns, check as unqualified name
-                None => self_unqualified_names.contains(field.name().as_str()),
+                None => self_unqualified_names.contains(field.name()),
             };
             if !duplicated_field {
                 // self.inner.fields.push(field.clone());
@@ -325,19 +326,19 @@ impl DFSchema {
     }
 
     /// Get a list of fields
-    pub fn fields(&self) -> &Fields {
+    pub fn fields(&self) -> &LogicalPhysicalFields {
         &self.inner.fields
     }
 
     /// Returns an immutable reference of a specific `Field` instance selected using an
     /// offset within the internal `fields` vector
-    pub fn field(&self, i: usize) -> &Field {
+    pub fn field(&self, i: usize) -> &LogicalPhysicalField {
         &self.inner.fields[i]
     }
 
     /// Returns an immutable reference of a specific `Field` instance selected using an
     /// offset within the internal `fields` vector and its qualifier
-    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &Field) {
+    pub fn qualified_field(&self, i: usize) -> (Option<&TableReference>, &LogicalPhysicalField) {
         (self.field_qualifiers[i].as_ref(), self.field(i))
     }
 
@@ -404,7 +405,7 @@ impl DFSchema {
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<&Field> {
+    ) -> Result<&LogicalPhysicalField> {
         if let Some(qualifier) = qualifier {
             self.field_with_qualified_name(qualifier, name)
         } else {
@@ -417,7 +418,7 @@ impl DFSchema {
         &self,
         qualifier: Option<&TableReference>,
         name: &str,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &LogicalPhysicalField)> {
         if let Some(qualifier) = qualifier {
             let idx = self
                 .index_of_column_by_name(Some(qualifier), name)
@@ -429,7 +430,10 @@ impl DFSchema {
     }
 
     /// Find all fields having the given qualifier
-    pub fn fields_with_qualified(&self, qualifier: &TableReference) -> Vec<&Field> {
+    pub fn fields_with_qualified(
+        &self,
+        qualifier: &TableReference,
+    ) -> Vec<&LogicalPhysicalField> {
         self.iter()
             .filter(|(q, _)| q.map(|q| q.eq(qualifier)).unwrap_or(false))
             .map(|(_, f)| f.as_ref())
@@ -448,7 +452,7 @@ impl DFSchema {
     }
 
     /// Find all fields that match the given name
-    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&Field> {
+    pub fn fields_with_unqualified_name(&self, name: &str) -> Vec<&LogicalPhysicalField> {
         self.fields()
             .iter()
             .filter(|field| field.name() == name)
@@ -460,7 +464,7 @@ impl DFSchema {
     pub fn qualified_fields_with_unqualified_name(
         &self,
         name: &str,
-    ) -> Vec<(Option<&TableReference>, &Field)> {
+    ) -> Vec<(Option<&TableReference>, &LogicalPhysicalField)> {
         self.iter()
             .filter(|(_, field)| field.name() == name)
             .map(|(qualifier, field)| (qualifier, field.as_ref()))
@@ -478,9 +482,7 @@ impl DFSchema {
     /// Return all `Column`s for the schema
     pub fn columns(&self) -> Vec<Column> {
         self.iter()
-            .map(|(qualifier, field)| {
-                Column::new(qualifier.cloned(), field.name().clone())
-            })
+            .map(|(qualifier, field)| Column::new(qualifier.cloned(), field.name()))
            .collect()
     }
 
@@ -488,7 +490,7 @@ impl DFSchema {
     pub fn qualified_field_with_unqualified_name(
         &self,
         name: &str,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &LogicalPhysicalField)> {
         let matches = self.qualified_fields_with_unqualified_name(name);
         match matches.len() {
             0 => Err(unqualified_field_not_found(name, self)),
@@ -520,7 +522,7 @@ impl DFSchema {
     }
 
     /// Find the field with the given name
-    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&Field> {
+    pub fn field_with_unqualified_name(&self, name: &str) -> Result<&LogicalPhysicalField> {
         let matches = self.qualified_fields_with_unqualified_name(name);
         match matches.len() {
             0 => Err(unqualified_field_not_found(name, self)),
@@ -556,7 +558,7 @@ impl DFSchema {
         &self,
         qualifier: &TableReference,
         name: &str,
-    ) -> Result<&Field> {
+    ) -> Result<&LogicalPhysicalField> {
         let idx = self
             .index_of_column_by_name(Some(qualifier), name)
             .ok_or_else(|| field_not_found(Some(qualifier.clone()), name, self))?;
@@ -565,7 +567,7 @@ impl DFSchema {
     }
 
     /// Find the field with the given qualified column
-    pub fn field_from_column(&self, column: &Column) -> Result<&Field> {
+    pub fn field_from_column(&self, column: &Column) -> Result<&LogicalPhysicalField> {
         match &column.relation {
             Some(r) => self.field_with_qualified_name(r, &column.name),
             None => self.field_with_unqualified_name(&column.name),
@@ -576,7 +578,7 @@ impl DFSchema {
     pub fn qualified_field_from_column(
         &self,
         column: &Column,
-    ) -> Result<(Option<&TableReference>, &Field)> {
+    ) -> Result<(Option<&TableReference>, &LogicalPhysicalField)> {
         self.qualified_field_with_name(column.relation.as_ref(), &column.name)
     }
 
@@ -671,7 +673,10 @@ impl DFSchema {
         self_fields.zip(other_fields).all(|((q1, f1), (q2, f2))| {
             q1 == q2
                 && f1.name() == f2.name()
-                && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
+                && Self::datatype_is_semantically_equal(
+                    &f1.data_type().physical(),
+                    &f2.data_type().physical(),
+                )
         })
     }
 
@@ -679,40 +684,8 @@ impl DFSchema {
     /// than datatype_is_semantically_equal in that a Dictionary<K,V> type is logically
     /// equal to a plain V type, but not semantically equal. Dictionary<K1, V1> is also
     /// logically equal to Dictionary<K2, V1>.
-    pub fn datatype_is_logically_equal(dt1: &DataType, dt2: &DataType) -> bool {
-        // check nested fields
-        match (dt1, dt2) {
-            (DataType::Dictionary(_, v1), DataType::Dictionary(_, v2)) => {
-                v1.as_ref() == v2.as_ref()
-            }
-            (DataType::Dictionary(_, v1), othertype) => v1.as_ref() == othertype,
-            (othertype, DataType::Dictionary(_, v1)) => v1.as_ref() == othertype,
-            (DataType::List(f1), DataType::List(f2))
-            | (DataType::LargeList(f1), DataType::LargeList(f2))
-            | (DataType::FixedSizeList(f1, _), DataType::FixedSizeList(f2, _))
-            | (DataType::Map(f1, _), DataType::Map(f2, _)) => {
-                Self::field_is_logically_equal(f1, f2)
-            }
-            (DataType::Struct(fields1), DataType::Struct(fields2)) => {
-                let iter1 = fields1.iter();
-                let iter2 = fields2.iter();
-                fields1.len() == fields2.len() &&
-                // all fields have to be the same
-                iter1
-                .zip(iter2)
-                .all(|(f1, f2)| Self::field_is_logically_equal(f1, f2))
-            }
-            (DataType::Union(fields1, _), DataType::Union(fields2, _)) => {
-                let iter1 = fields1.iter();
-                let iter2 = fields2.iter();
-                fields1.len() == fields2.len() &&
-                // all fields have to be the same
-                iter1
-                .zip(iter2)
-                .all(|((t1, f1), (t2, f2))| t1 == t2 && Self::field_is_logically_equal(f1, f2))
-            }
-            _ => dt1 == dt2,
-        }
+    pub fn datatype_is_logically_equal(dt1: &LogicalPhysicalType, dt2: &LogicalPhysicalType) -> bool {
+        dt1 == dt2
     }
 
     /// Returns true of two [`DataType`]s are semantically equal (same
@@ -762,11 +735,6 @@ impl DFSchema {
         }
     }
 
-    fn field_is_logically_equal(f1: &Field, f2: &Field) -> bool {
-        f1.name() == f2.name()
-            && Self::datatype_is_logically_equal(f1.data_type(), f2.data_type())
-    }
-
     fn field_is_semantically_equal(f1: &Field, f2: &Field) -> bool {
         f1.name() == f2.name()
             && Self::datatype_is_semantically_equal(f1.data_type(), f2.data_type())
@@ -809,10 +777,12 @@ impl DFSchema {
     }
 
     /// Iterate over the qualifiers and fields in the DFSchema
-    pub fn iter(&self) -> impl Iterator<Item = (Option<&TableReference>, &FieldRef)> {
+    pub fn iter(
+        &self,
+    ) -> impl Iterator<Item = (Option<&TableReference>, &LogicalPhysicalFieldRef)> {
         self.field_qualifiers
             .iter()
-            .zip(self.inner.fields().iter())
+            .zip(self.inner.fields.iter())
            .map(|(qualifier, field)| (qualifier.as_ref(), field))
     }
 }
@@ -820,7 +790,7 @@ impl From<DFSchema> for Schema {
    /// Convert DFSchema into a Schema
     fn from(df_schema: DFSchema) -> Self {
-        let fields: Fields = df_schema.inner.fields.clone();
+        let fields: Fields = df_schema.inner.fields.clone().into();
         Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
     }
 }
@@ -828,23 +798,29 @@ impl From<&DFSchema> for Schema {
     /// Convert DFSchema reference into a Schema
     fn from(df_schema: &DFSchema) -> Self {
-        let fields: Fields = df_schema.inner.fields.clone();
+        let fields: Fields = df_schema.inner.fields.clone().into();
         Schema::new_with_metadata(fields, df_schema.inner.metadata.clone())
     }
 }
 
-/// Allow DFSchema to be converted into an Arrow `&Schema`
-impl AsRef<Schema> for DFSchema {
-    fn as_ref(&self) -> &Schema {
-        self.as_arrow()
+/// Create a `DFSchema` from an Arrow schema
+impl TryFrom<LogicalPhysicalSchema> for DFSchema {
+    type Error = DataFusionError;
+    fn try_from(schema: LogicalPhysicalSchema) -> Result<Self, Self::Error> {
+        Self::try_from(Arc::new(schema))
     }
 }
 
-/// Allow DFSchema to be converted into an Arrow `&SchemaRef` (to clone, for
-/// example)
-impl AsRef<SchemaRef> for DFSchema {
-    fn as_ref(&self) -> &SchemaRef {
-        self.inner()
+impl TryFrom<LogicalPhysicalSchemaRef> for DFSchema {
+    type Error = DataFusionError;
+    fn try_from(schema: LogicalPhysicalSchemaRef) -> Result<Self, Self::Error> {
+        let field_count = schema.fields.len();
+        let dfschema = Self {
+            inner: schema,
+            field_qualifiers: vec![None; field_count],
+            functional_dependencies: FunctionalDependencies::empty(),
+        };
+        Ok(dfschema)
     }
 }
 
@@ -852,20 +828,14 @@ impl AsRef<SchemaRef> for DFSchema {
 impl TryFrom<Schema> for DFSchema {
     type Error = DataFusionError;
     fn try_from(schema: Schema) -> Result<Self, Self::Error> {
-        Self::try_from(Arc::new(schema))
+        Self::try_from(LogicalPhysicalSchema::from(schema))
     }
 }
 
 impl TryFrom<SchemaRef> for DFSchema {
     type Error = DataFusionError;
     fn try_from(schema: SchemaRef) -> Result<Self, Self::Error> {
-        let field_count = schema.fields.len();
-        let dfschema = Self {
-            inner: schema,
-            field_qualifiers: vec![None; field_count],
-            functional_dependencies: FunctionalDependencies::empty(),
-        };
-        Ok(dfschema)
+        Self::try_from(schema.as_ref().clone())
     }
 }
 
@@ -897,22 +867,22 @@ where
     }
 }
 
-impl ToDFSchema for Schema {
+impl ToDFSchema for LogicalPhysicalSchema {
     fn to_dfschema(self) -> Result<DFSchema> {
         DFSchema::try_from(self)
     }
 }
 
-impl ToDFSchema for SchemaRef {
+impl ToDFSchema for LogicalPhysicalSchemaRef {
     fn to_dfschema(self) -> Result<DFSchema> {
         DFSchema::try_from(self)
     }
 }
 
-impl ToDFSchema for Vec<Field> {
+impl ToDFSchema for Vec<LogicalPhysicalField> {
     fn to_dfschema(self) -> Result<DFSchema> {
         let field_count = self.len();
-        let schema = Schema {
+        let schema = LogicalPhysicalSchema {
             fields: self.into(),
             metadata: HashMap::new(),
         };
@@ -949,13 +919,13 @@ pub trait ExprSchema: std::fmt::Debug {
     fn nullable(&self, col: &Column) -> Result<bool>;
 
     /// What is the datatype of this column?
-    fn data_type(&self, col: &Column) -> Result<&DataType>;
+    fn data_type(&self, col: &Column) -> Result<&LogicalPhysicalType>;
 
     /// Returns the column's optional metadata.
     fn metadata(&self, col: &Column) -> Result<&HashMap<String, String>>;
 
     /// Return the coulmn's datatype and nullability
-    fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)>;
+    fn data_type_and_nullable(&self, col: &Column) -> Result<(&LogicalPhysicalType, bool)>;
 }
 
 // Implement `ExprSchema` for `Arc<DFSchema>`
@@ -964,7 +934,7 @@ impl<P: AsRef<DFSchema> + std::fmt::Debug> ExprSchema for P {
         self.as_ref().nullable(col)
     }
 
-    fn data_type(&self, col: &Column) -> Result<&DataType> {
+    fn data_type(&self, col: &Column) -> Result<&LogicalPhysicalType> {
         self.as_ref().data_type(col)
     }
 
@@ -972,7 +942,7 @@ impl<P: AsRef<DFSchema> + std::fmt::Debug> ExprSchema for P {
         ExprSchema::metadata(self.as_ref(), col)
     }
 
-    fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)> {
+    fn data_type_and_nullable(&self, col: &Column) -> Result<(&LogicalPhysicalType, bool)> {
         self.as_ref().data_type_and_nullable(col)
     }
 }
@@ -982,7 +952,7 @@ impl ExprSchema for DFSchema {
         Ok(self.field_from_column(col)?.is_nullable())
     }
 
-    fn data_type(&self, col: &Column) -> Result<&DataType> {
+    fn data_type(&self, col: &Column) -> Result<&LogicalPhysicalType> {
         Ok(self.field_from_column(col)?.data_type())
     }
 
@@ -990,7 +960,7 @@ impl ExprSchema for DFSchema {
         Ok(self.field_from_column(col)?.metadata())
     }
 
-    fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)> {
+    fn data_type_and_nullable(&self, col: &Column) -> Result<(&LogicalPhysicalType, bool)> {
         let field = self.field_from_column(col)?;
         Ok((field.data_type(), field.is_nullable()))
     }
@@ -1041,8 +1011,8 @@ impl SchemaExt for Schema {
             .all(|(f1, f2)| {
                 f1.name() == f2.name()
                     && DFSchema::datatype_is_logically_equal(
-                        f1.data_type(),
-                        f2.data_type(),
+                        &f1.data_type().into(),
+                        &f2.data_type().into(),
                     )
             })
     }
@@ -1082,7 +1052,8 @@ mod tests {
             &Schema::new(vec![
                 Field::new("CapitalColumn", DataType::Boolean, true),
                 Field::new("field.with.period", DataType::Boolean, true),
-            ]),
+            ])
+            .into(),
         )?;
 
        // lookup with unqualified name "t1.c0"
@@ -1112,9 +1083,9 @@ mod tests {
     fn test_from_field_specific_qualified_schema() -> Result<()> {
         let schema = DFSchema::from_field_specific_qualified_schema(
             vec![Some("t1".into()), None],
-            &Arc::new(Schema::new(vec![
-                Field::new("c0", DataType::Boolean, true),
-                Field::new("c1", DataType::Boolean, true),
+            &Arc::new(LogicalPhysicalSchema::new(vec![
+                LogicalPhysicalField::new("c0", DataType::Boolean, true),
+                LogicalPhysicalField::new("c1", DataType::Boolean, true),
             ])),
         )?;
         assert_eq!("fields:[t1.c0, c1], metadata:{}", schema.to_string());
@@ -1127,9 +1098,12 @@ mod tests {
             vec![
                 (
                     Some("t0".into()),
-                    Arc::new(Field::new("c0", DataType::Boolean, true)),
+                    Arc::new(Field::new("c0", DataType::Boolean, true).into()),
+                ),
+                (
+                    None,
+                    Arc::new(Field::new("c1", DataType::Boolean, true).into()),
                 ),
-                (None, Arc::new(Field::new("c1", DataType::Boolean, true))),
             ],
             HashMap::new(),
         )?;
@@ -1273,41 +1247,43 @@ mod tests {
             vec![Field::new("c0", DataType::Int64, true)],
             metadata.clone(),
         );
-        let arrow_schema_ref = Arc::new(arrow_schema.clone());
+        let logical_schema = LogicalPhysicalSchema::from(arrow_schema);
+        let logical_schema_ref = Arc::new(logical_schema.clone());
 
         let df_schema = DFSchema {
-            inner: arrow_schema_ref.clone(),
-            field_qualifiers: vec![None; arrow_schema_ref.fields.len()],
+            inner: logical_schema_ref.clone(),
+            field_qualifiers: vec![None; logical_schema_ref.fields.len()],
             functional_dependencies: FunctionalDependencies::empty(),
         };
         let df_schema_ref = Arc::new(df_schema.clone());
 
         {
-            let arrow_schema = arrow_schema.clone();
-            let arrow_schema_ref = arrow_schema_ref.clone();
+            let logical_schema = logical_schema.clone();
+            let logical_schema_ref = logical_schema_ref.clone();
 
-            assert_eq!(df_schema, arrow_schema.to_dfschema().unwrap());
-            assert_eq!(df_schema, arrow_schema_ref.to_dfschema().unwrap());
+            assert_eq!(df_schema, logical_schema.to_dfschema().unwrap());
+            assert_eq!(df_schema, logical_schema_ref.to_dfschema().unwrap());
         }
 
         {
-            let arrow_schema = arrow_schema.clone();
-            let arrow_schema_ref = arrow_schema_ref.clone();
+            let logical_schema = logical_schema.clone();
+            let logical_schema_ref = logical_schema_ref.clone();
 
-            assert_eq!(df_schema_ref, arrow_schema.to_dfschema_ref().unwrap());
-            assert_eq!(df_schema_ref, arrow_schema_ref.to_dfschema_ref().unwrap());
+            assert_eq!(df_schema_ref, logical_schema.to_dfschema_ref().unwrap());
+            assert_eq!(df_schema_ref, logical_schema_ref.to_dfschema_ref().unwrap());
         }
 
         // Now, consume the refs
-        assert_eq!(df_schema_ref, arrow_schema.to_dfschema_ref().unwrap());
-        assert_eq!(df_schema_ref, arrow_schema_ref.to_dfschema_ref().unwrap());
+        assert_eq!(df_schema_ref, logical_schema.to_dfschema_ref().unwrap());
+        assert_eq!(df_schema_ref, logical_schema_ref.to_dfschema_ref().unwrap());
     }
 
-    fn test_schema_1() -> Schema {
+    fn test_schema_1() -> LogicalPhysicalSchema {
         Schema::new(vec![
             Field::new("c0", DataType::Boolean, true),
             Field::new("c1", DataType::Boolean, true),
         ])
+        .into()
     }
 
     #[test]
     fn test_dfschema_to_schema_convertion() {
@@ -1319,7 +1295,7 @@ mod tests {
         let mut a_metadata = HashMap::new();
         a_metadata.insert("key".to_string(), "value".to_string());
         let a_field = Field::new("a", DataType::Int64, false).with_metadata(a_metadata);
 
         let mut b_metadata = HashMap::new();
         b_metadata.insert("key".to_string(), "value".to_string());
         let b_field = Field::new("b", DataType::Int64, false).with_metadata(b_metadata);
 
-        let schema = Arc::new(Schema::new(vec![a_field, b_field]));
+        let schema = LogicalPhysicalSchemaRef::new(Schema::new(vec![a_field, b_field]).into());
 
         let df_schema = DFSchema {
             inner: schema.clone(),
@@ -1363,10 +1339,10 @@ mod tests {
         Ok(())
     }
 
-    fn test_schema_2() -> Schema {
-        Schema::new(vec![
-            Field::new("c100", DataType::Boolean, true),
-            Field::new("c101", DataType::Boolean, true),
+    fn test_schema_2() -> LogicalPhysicalSchema {
+        LogicalPhysicalSchema::new(vec![
+            LogicalPhysicalField::new("c100", DataType::Boolean, true),
+            LogicalPhysicalField::new("c101", DataType::Boolean, true),
         ])
     }
 
diff --git a/datafusion/common/src/lib.rs b/datafusion/common/src/lib.rs
index c275152642f0..9b32f325b698 100644
--- a/datafusion/common/src/lib.rs
+++ b/datafusion/common/src/lib.rs
@@ -35,6 +35,7 @@ pub mod file_options;
 pub mod format;
 pub mod hash_utils;
 pub mod instant;
+pub mod logical_type;
 pub mod parsers;
 pub mod rounding;
 pub mod scalar;
diff --git a/datafusion/common/src/logical_type/field.rs b/datafusion/common/src/logical_type/field.rs
new file mode 100644
index 000000000000..5220b788a243
--- /dev/null
+++ b/datafusion/common/src/logical_type/field.rs
@@ -0,0 +1,157 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::hash::{Hash, Hasher};
+use std::sync::Arc;
+
+use arrow_schema::{Field, FieldRef};
+
+use super::{TypeRelation, LogicalPhysicalType};
+
+pub type LogicalPhysicalFieldRef = Arc<LogicalPhysicalField>;
+
+#[derive(Debug, Clone)]
+pub struct LogicalPhysicalField {
+    name: String,
+    data_type: LogicalPhysicalType,
+    nullable: bool,
+    metadata: HashMap<String, String>,
+}
+
+impl From<&Field> for LogicalPhysicalField {
+    fn from(value: &Field) -> Self {
+        Self::new(value.name().clone(), value.data_type(), value.is_nullable())
+    }
+}
+
+impl From<Field> for LogicalPhysicalField {
+    fn from(value: Field) -> Self {
+        Self::from(&value)
+    }
+}
+
+impl From<&FieldRef> for LogicalPhysicalField {
+    fn from(value: &FieldRef) -> Self {
+        Self::from(value.as_ref())
+    }
+}
+
+impl From<FieldRef> for LogicalPhysicalField {
+    fn from(value: FieldRef) -> Self {
+        Self::from(value.as_ref())
+    }
+}
+
+impl Into<Field> for LogicalPhysicalField {
+    fn into(self) -> Field {
+        Field::new(self.name, self.data_type.physical().clone(), self.nullable)
+            .with_metadata(self.metadata)
+    }
+}
+
+impl PartialEq for LogicalPhysicalField {
+    fn eq(&self, other: &Self) -> bool {
+        self.name == other.name
+            && self.data_type == other.data_type
+            && self.nullable == other.nullable
+            && self.metadata == other.metadata
+    }
+}
+
+impl Eq for LogicalPhysicalField {}
+
+impl Hash for LogicalPhysicalField {
+    fn hash<H: Hasher>(&self, state: &mut H) {
+        self.name.hash(state);
+        self.data_type.hash(state);
+        self.nullable.hash(state);
+
+        // ensure deterministic key order
+        let mut keys: Vec<&String> = self.metadata.keys().collect();
+        keys.sort();
+        for k in keys {
+            k.hash(state);
+            self.metadata.get(k).expect("key valid").hash(state);
+        }
+    }
+}
+
+impl LogicalPhysicalField {
+    pub fn new(
+        name: impl Into<String>,
+        data_type: impl Into<LogicalPhysicalType>,
+        nullable: bool,
+    ) -> Self {
+        LogicalPhysicalField {
+            name: name.into(),
+            data_type: data_type.into(),
+            nullable,
+            metadata: HashMap::default(),
+        }
+    }
+
+    pub fn new_list_field(data_type: impl Into<LogicalPhysicalType>, nullable: bool) -> Self {
+        Self::new("item", data_type, nullable)
+    }
+
+    pub fn name(&self) -> &str {
+        &self.name
+    }
+
+    pub fn data_type(&self) -> &LogicalPhysicalType {
+        &self.data_type
+    }
+
+    pub fn is_nullable(&self) -> bool {
+        self.nullable
+    }
+
+    pub fn metadata(&self) -> &HashMap<String, String> {
+        &self.metadata
+    }
+
+    #[inline]
+    pub fn with_name(mut self, name: impl Into<String>) -> Self {
+        self.name = name.into();
+        self
+    }
+
+    #[inline]
+    pub fn with_nullable(mut self, nullable: bool) -> Self {
+        self.nullable = nullable;
+        self
+    }
+
+    #[inline]
+    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
+        self.metadata = metadata;
+        self
+    }
+
+    #[inline]
+    pub fn with_data_type(mut self, data_type: LogicalPhysicalType) -> Self {
+        self.data_type = data_type;
+        self
+    }
+}
+
+impl std::fmt::Display for LogicalPhysicalField {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
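Reviewer note: a minimal, self-contained sketch of the `LogicalPhysicalField` API introduced above, assuming this patch is applied. It exercises only conversions defined in this file; the `main` wrapper is illustrative, not part of the patch.

```rust
use arrow_schema::{DataType, Field};
use datafusion_common::logical_type::field::LogicalPhysicalField;

fn main() {
    // Construct directly; `DataType` lifts into a LogicalPhysicalType via Into.
    let logical = LogicalPhysicalField::new("id", DataType::Int64, false);
    assert_eq!(logical.name(), "id");
    assert!(!logical.is_nullable());

    // Round-trip with an arrow Field using the From/Into impls above.
    let arrow_field = Field::new("name", DataType::Utf8, true);
    let lifted: LogicalPhysicalField = (&arrow_field).into();
    let lowered: Field = lifted.into();
    assert_eq!(lowered.data_type(), &DataType::Utf8);
}
```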
diff --git a/datafusion/common/src/logical_type/fields.rs b/datafusion/common/src/logical_type/fields.rs
new file mode 100644
index 000000000000..4e0dd2291c60
--- /dev/null
+++ b/datafusion/common/src/logical_type/fields.rs
@@ -0,0 +1,165 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::ops::Deref;
+use std::sync::Arc;
+
+use arrow_schema::{Field, FieldRef, Fields, UnionFields};
+
+use super::field::{LogicalPhysicalField, LogicalPhysicalFieldRef};
+
+#[derive(Clone, Eq, PartialEq, Hash)]
+pub struct LogicalPhysicalFields(Arc<[LogicalPhysicalFieldRef]>);
+
+impl std::fmt::Debug for LogicalPhysicalFields {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.as_ref().fmt(f)
+    }
+}
+
+impl From<&Fields> for LogicalPhysicalFields {
+    fn from(value: &Fields) -> Self {
+        Self(
+            value
+                .iter()
+                .map(|v| LogicalPhysicalFieldRef::new(v.into()))
+                .collect(),
+        )
+    }
+}
+
+impl From<Fields> for LogicalPhysicalFields {
+    fn from(value: Fields) -> Self {
+        Self::from(&value)
+    }
+}
+
+impl Into<Fields> for LogicalPhysicalFields {
+    fn into(self) -> Fields {
+        Fields::from(
+            self.iter()
+                .map(|f| f.as_ref().clone().into())
+                .collect::<Vec<Field>>(),
+        )
+    }
+}
+
+impl Default for LogicalPhysicalFields {
+    fn default() -> Self {
+        Self::empty()
+    }
+}
+
+impl FromIterator<LogicalPhysicalField> for LogicalPhysicalFields {
+    fn from_iter<T: IntoIterator<Item = LogicalPhysicalField>>(iter: T) -> Self {
+        iter.into_iter().map(Arc::new).collect()
+    }
+}
+
+impl FromIterator<LogicalPhysicalFieldRef> for LogicalPhysicalFields {
+    fn from_iter<T: IntoIterator<Item = LogicalPhysicalFieldRef>>(iter: T) -> Self {
+        Self(iter.into_iter().collect())
+    }
+}
+
+impl From<Vec<LogicalPhysicalField>> for LogicalPhysicalFields {
+    fn from(value: Vec<LogicalPhysicalField>) -> Self {
+        value.into_iter().collect()
+    }
+}
+
+impl From<Vec<LogicalPhysicalFieldRef>> for LogicalPhysicalFields {
+    fn from(value: Vec<LogicalPhysicalFieldRef>) -> Self {
+        Self(value.into())
+    }
+}
+
+impl From<&[LogicalPhysicalFieldRef]> for LogicalPhysicalFields {
+    fn from(value: &[LogicalPhysicalFieldRef]) -> Self {
+        Self(value.into())
+    }
+}
+
+impl<const N: usize> From<[LogicalPhysicalFieldRef; N]> for LogicalPhysicalFields {
+    fn from(value: [LogicalPhysicalFieldRef; N]) -> Self {
+        Self(Arc::new(value))
+    }
+}
+
+impl Deref for LogicalPhysicalFields {
+    type Target = [LogicalPhysicalFieldRef];
+
+    fn deref(&self) -> &Self::Target {
+        self.0.as_ref()
+    }
+}
+
+impl<'a> IntoIterator for &'a LogicalPhysicalFields {
+    type Item = &'a LogicalPhysicalFieldRef;
+    type IntoIter = std::slice::Iter<'a, LogicalPhysicalFieldRef>;
+
+    fn into_iter(self) -> Self::IntoIter {
+        self.0.iter()
+    }
+}
+
+impl LogicalPhysicalFields {
+    pub fn empty() -> Self {
+        Self(Arc::new([]))
+    }
+}
+
+#[derive(Clone, Eq, PartialEq, Hash)]
+pub struct LogicalUnionFields(Arc<[(i8, LogicalPhysicalFieldRef)]>);
+
+impl std::fmt::Debug for LogicalUnionFields {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        self.0.as_ref().fmt(f)
+    }
+}
+
+impl FromIterator<(i8, LogicalPhysicalFieldRef)> for LogicalUnionFields {
+    fn from_iter<T: IntoIterator<Item = (i8, LogicalPhysicalFieldRef)>>(iter: T) -> Self {
+        Self(iter.into_iter().collect())
+    }
+}
+
+impl From<&UnionFields> for LogicalUnionFields {
+    fn from(value: &UnionFields) -> Self {
+        Self::from_iter(
+            value
+                .iter()
+                .map(|(i, f)| (i, LogicalPhysicalFieldRef::new(f.into()))),
+        )
+    }
+}
+
+impl From<UnionFields> for LogicalUnionFields {
+    fn from(value: UnionFields) -> Self {
+        Self::from(&value)
+    }
+}
+
+impl Into<UnionFields> for LogicalUnionFields {
+    fn into(self) -> UnionFields {
+        UnionFields::from_iter(
+            self.0
+                .into_iter()
+                .map(|(i, f)| (*i, FieldRef::new(f.as_ref().clone().into()))),
+        )
+    }
+}
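Reviewer note: `LogicalPhysicalFields` mirrors arrow's `Fields` (an `Arc`-backed slice), so the usual construction paths carry over. A sketch, again using only impls from this file:

```rust
use arrow_schema::{DataType, Field, Fields};
use datafusion_common::logical_type::field::LogicalPhysicalField;
use datafusion_common::logical_type::fields::LogicalPhysicalFields;

fn main() {
    // From owned logical fields: FromIterator wraps each one in an Arc.
    let fields: LogicalPhysicalFields = vec![
        LogicalPhysicalField::new("a", DataType::Int32, true),
        LogicalPhysicalField::new("b", DataType::Utf8, false),
    ]
    .into();
    assert_eq!(fields.len(), 2); // Deref to [LogicalPhysicalFieldRef]

    // Lift arrow Fields and lower them back.
    let arrow: Fields = vec![Field::new("c", DataType::Boolean, true)].into();
    let lifted: LogicalPhysicalFields = arrow.into();
    let lowered: Fields = lifted.into();
    assert_eq!(lowered.len(), 1);
}
```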
diff --git a/datafusion/common/src/logical_type/mod.rs b/datafusion/common/src/logical_type/mod.rs
new file mode 100644
index 000000000000..c59946719a21
--- /dev/null
+++ b/datafusion/common/src/logical_type/mod.rs
@@ -0,0 +1,202 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::{fmt::Display, sync::Arc};
+
+use arrow_schema::DataType;
+
+use field::{LogicalPhysicalField, LogicalPhysicalFieldRef};
+use fields::LogicalPhysicalFields;
+use signature::LogicalType;
+
+pub mod field;
+pub mod fields;
+pub mod schema;
+pub mod signature;
+
+pub type TypeRelationRef = Arc<dyn TypeRelation + Send + Sync>;
+
+pub trait TypeRelation: std::fmt::Debug {
+    fn logical(&self) -> &LogicalType;
+    fn physical(&self) -> &DataType;
+}
+
+#[derive(Clone, Debug)]
+pub struct LogicalPhysicalType(TypeRelationRef);
+
+impl LogicalPhysicalType {
+    pub fn new_list(inner: LogicalPhysicalType, nullable: bool) -> Self {
+        Self(Arc::new(NativeType::new_list(inner, nullable)))
+    }
+
+    pub fn new_struct(fields: LogicalPhysicalFields) -> Self {
+        Self(Arc::new(NativeType::new_struct(fields)))
+    }
+}
+
+pub type NativeTypeRef = Arc<NativeType>;
+
+#[derive(Clone, Debug)]
+pub struct NativeType {
+    logical: LogicalType,
+    physical: DataType,
+}
+
+impl TypeRelation for NativeType {
+    fn logical(&self) -> &LogicalType {
+        &self.logical
+    }
+
+    fn physical(&self) -> &DataType {
+        &self.physical
+    }
+}
+
+impl NativeType {
+    pub fn new_list(inner: LogicalPhysicalType, nullable: bool) -> Self {
+        Self {
+            physical: DataType::new_list(inner.physical().clone(), nullable),
+            logical: LogicalType::List(LogicalPhysicalFieldRef::new(
+                LogicalPhysicalField::new_list_field(inner, nullable),
+            )),
+        }
+    }
+
+    pub fn new_struct(fields: LogicalPhysicalFields) -> Self {
+        Self {
+            physical: DataType::Struct(fields.clone().into()),
+            logical: LogicalType::Struct(fields),
+        }
+    }
+}
+
+impl TypeRelation for LogicalPhysicalType {
+    fn logical(&self) -> &LogicalType {
+        self.0.logical()
+    }
+
+    fn physical(&self) -> &DataType {
+        self.0.physical()
+    }
+}
+
+impl PartialEq for LogicalPhysicalType {
+    fn eq(&self, other: &Self) -> bool {
+        self.logical() == other.logical()
+    }
+}
+
+impl Eq for LogicalPhysicalType {}
+
+impl std::hash::Hash for LogicalPhysicalType {
+    fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
+        self.logical().hash(state)
+    }
+}
+
+impl Display for LogicalPhysicalType {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl From<DataType> for NativeType {
+    fn from(value: DataType) -> Self {
+        Self {
+            logical: (&value).into(),
+            physical: value,
+        }
+    }
+}
+
+impl From<TypeRelationRef> for LogicalPhysicalType {
+    fn from(value: TypeRelationRef) -> Self {
+        Self(value)
+    }
+}
+
+impl From<&DataType> for LogicalPhysicalType {
+    fn from(value: &DataType) -> Self {
+        value.clone().into()
+    }
+}
+
+impl From<DataType> for LogicalPhysicalType {
+    fn from(value: DataType) -> Self {
+        Self(NativeTypeRef::new(value.into()))
+    }
+}
+
+impl From<DataType> for LogicalType {
+    fn from(value: DataType) -> Self {
+        (&value).into()
+    }
+}
+
+impl From<&DataType> for LogicalType {
+    fn from(value: &DataType) -> Self {
+        match value {
+            DataType::Null => LogicalType::Null,
+            DataType::Boolean => LogicalType::Boolean,
+            DataType::Int8 => LogicalType::Int8,
+            DataType::Int16 => LogicalType::Int16,
+            DataType::Int32 => LogicalType::Int32,
+            DataType::Int64 => LogicalType::Int64,
+            DataType::UInt8 => LogicalType::UInt8,
+            DataType::UInt16 => LogicalType::UInt16,
+            DataType::UInt32 => LogicalType::UInt32,
+            DataType::UInt64 => LogicalType::UInt64,
+            DataType::Float16 => LogicalType::Float16,
+            DataType::Float32 => LogicalType::Float32,
+            DataType::Float64 => LogicalType::Float64,
+            DataType::Timestamp(tu, tz) => LogicalType::Timestamp(tu.clone(), tz.clone()),
+            DataType::Date32 | DataType::Date64 => LogicalType::Date,
+            DataType::Time32(tu) => LogicalType::Time32(tu.clone()),
+            DataType::Time64(tu) => LogicalType::Time64(tu.clone()),
+            DataType::Duration(tu) => LogicalType::Duration(tu.clone()),
+            DataType::Interval(iu) => LogicalType::Interval(iu.clone()),
+            DataType::Binary
+            | DataType::FixedSizeBinary(_)
+            | DataType::LargeBinary
+            | DataType::BinaryView => LogicalType::Binary,
+            DataType::Utf8 | DataType::LargeUtf8 | DataType::Utf8View => {
+                LogicalType::Utf8
+            }
+            DataType::List(f)
+            | DataType::ListView(f)
+            | DataType::FixedSizeList(f, _)
+            | DataType::LargeList(f)
+            | DataType::LargeListView(f) => {
+                LogicalType::List(LogicalPhysicalFieldRef::new(f.as_ref().clone().into()))
+            }
+            DataType::Struct(f) => LogicalType::Struct(f.clone().into()),
+            DataType::Dictionary(_, t) => t.as_ref().into(),
+            DataType::Decimal128(precision, scale) => {
+                LogicalType::Decimal128(precision.clone(), scale.clone())
+            }
+            DataType::Decimal256(precision, scale) => {
+                LogicalType::Decimal256(precision.clone(), scale.clone())
+            }
+            DataType::Map(f, sorted) => LogicalType::Map(
+                LogicalPhysicalFieldRef::new(f.as_ref().clone().into()),
+                sorted.clone(),
+            ),
+            DataType::RunEndEncoded(_, f) => f.data_type().into(),
+            DataType::Union(f, mode) => LogicalType::Union(f.into(), mode.clone()),
+        }
+    }
+}
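Reviewer note: `TypeRelation` is the extension point here, and `NativeType` is the built-in implementation pairing a `LogicalType` signature with its physical arrow `DataType`. Equality and hashing of `LogicalPhysicalType` deliberately look only at the logical half, so physically different encodings of the same logical type compare equal. A sketch:

```rust
use arrow_schema::DataType;
use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::{LogicalPhysicalType, TypeRelation};

fn main() {
    // Both views of the type are retained.
    let t: LogicalPhysicalType = DataType::LargeUtf8.into();
    assert_eq!(t.logical(), &LogicalType::Utf8);
    assert_eq!(t.physical(), &DataType::LargeUtf8);

    // Utf8 and LargeUtf8 differ physically but are the same logical type,
    // so they compare equal under the PartialEq impl above.
    let small: LogicalPhysicalType = DataType::Utf8.into();
    assert_eq!(t, small);
}
```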
diff --git a/datafusion/common/src/logical_type/schema.rs b/datafusion/common/src/logical_type/schema.rs
new file mode 100644
index 000000000000..856060162253
--- /dev/null
+++ b/datafusion/common/src/logical_type/schema.rs
@@ -0,0 +1,210 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use std::collections::HashMap;
+use std::sync::Arc;
+
+use arrow_schema::{Schema, SchemaRef};
+
+use super::field::{LogicalPhysicalField, LogicalPhysicalFieldRef};
+use super::fields::LogicalPhysicalFields;
+
+#[derive(Debug, Default)]
+pub struct LogicalPhysicalSchemaBuilder {
+    fields: Vec<LogicalPhysicalFieldRef>,
+    metadata: HashMap<String, String>,
+}
+
+impl LogicalPhysicalSchemaBuilder {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    pub fn with_capacity(capacity: usize) -> Self {
+        Self {
+            fields: Vec::with_capacity(capacity),
+            metadata: Default::default(),
+        }
+    }
+
+    pub fn push(&mut self, field: impl Into<LogicalPhysicalFieldRef>) {
+        self.fields.push(field.into())
+    }
+
+    pub fn remove(&mut self, idx: usize) -> LogicalPhysicalFieldRef {
+        self.fields.remove(idx)
+    }
+
+    pub fn field(&mut self, idx: usize) -> &LogicalPhysicalFieldRef {
+        &mut self.fields[idx]
+    }
+
+    pub fn field_mut(&mut self, idx: usize) -> &mut LogicalPhysicalFieldRef {
+        &mut self.fields[idx]
+    }
+
+    pub fn metadata(&mut self) -> &HashMap<String, String> {
+        &self.metadata
+    }
+
+    pub fn metadata_mut(&mut self) -> &mut HashMap<String, String> {
+        &mut self.metadata
+    }
+
+    pub fn reverse(&mut self) {
+        self.fields.reverse();
+    }
+
+    pub fn finish(self) -> LogicalPhysicalSchema {
+        LogicalPhysicalSchema {
+            fields: self.fields.into(),
+            metadata: self.metadata,
+        }
+    }
+}
+
+impl From<&LogicalPhysicalFields> for LogicalPhysicalSchemaBuilder {
+    fn from(value: &LogicalPhysicalFields) -> Self {
+        Self {
+            fields: value.to_vec(),
+            metadata: Default::default(),
+        }
+    }
+}
+
+impl From<LogicalPhysicalFields> for LogicalPhysicalSchemaBuilder {
+    fn from(value: LogicalPhysicalFields) -> Self {
+        Self {
+            fields: value.to_vec(),
+            metadata: Default::default(),
+        }
+    }
+}
+
+impl From<&LogicalPhysicalSchema> for LogicalPhysicalSchemaBuilder {
+    fn from(value: &LogicalPhysicalSchema) -> Self {
+        Self::from(value.clone())
+    }
+}
+
+impl From<LogicalPhysicalSchema> for LogicalPhysicalSchemaBuilder {
+    fn from(value: LogicalPhysicalSchema) -> Self {
+        Self {
+            fields: value.fields.to_vec(),
+            metadata: value.metadata,
+        }
+    }
+}
+
+impl Extend<LogicalPhysicalFieldRef> for LogicalPhysicalSchemaBuilder {
+    fn extend<T: IntoIterator<Item = LogicalPhysicalFieldRef>>(&mut self, iter: T) {
+        let iter = iter.into_iter();
+        self.fields.reserve(iter.size_hint().0);
+        for f in iter {
+            self.push(f)
+        }
+    }
+}
+
+impl Extend<LogicalPhysicalField> for LogicalPhysicalSchemaBuilder {
+    fn extend<T: IntoIterator<Item = LogicalPhysicalField>>(&mut self, iter: T) {
+        let iter = iter.into_iter();
+        self.fields.reserve(iter.size_hint().0);
+        for f in iter {
+            self.push(f)
+        }
+    }
+}
+
+pub type LogicalPhysicalSchemaRef = Arc<LogicalPhysicalSchema>;
+
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct LogicalPhysicalSchema {
+    pub fields: LogicalPhysicalFields,
+    pub metadata: HashMap<String, String>,
+}
+
+impl From<Schema> for LogicalPhysicalSchema {
+    fn from(value: Schema) -> Self {
+        Self {
+            fields: value.fields.into(),
+            metadata: value.metadata,
+        }
+    }
+}
+
+impl From<&Schema> for LogicalPhysicalSchema {
+    fn from(value: &Schema) -> Self {
+        Self::from(value.clone())
+    }
+}
+
+impl From<SchemaRef> for LogicalPhysicalSchema {
+    fn from(value: SchemaRef) -> Self {
+        Self::from(value.as_ref())
+    }
+}
+
+impl From<&SchemaRef> for LogicalPhysicalSchema {
+    fn from(value: &SchemaRef) -> Self {
+        Self::from(value.as_ref())
+    }
+}
+
+impl Into<Schema> for LogicalPhysicalSchema {
+    fn into(self) -> Schema {
+        Schema {
+            fields: self.fields.into(),
+            metadata: self.metadata,
+        }
+    }
+}
+
+impl LogicalPhysicalSchema {
+    pub fn new(fields: impl Into<LogicalPhysicalFields>) -> Self {
+        Self::new_with_metadata(fields, HashMap::new())
+    }
+
+    #[inline]
+    pub fn new_with_metadata(
+        fields: impl Into<LogicalPhysicalFields>,
+        metadata: HashMap<String, String>,
+    ) -> Self {
+        Self {
+            fields: fields.into(),
+            metadata,
+        }
+    }
+
+    #[inline]
+    pub fn with_metadata(mut self, metadata: HashMap<String, String>) -> Self {
+        self.metadata = metadata;
+        self
+    }
+
+    pub fn metadata(&self) -> &HashMap<String, String> {
+        &self.metadata
+    }
+
+    pub fn field(&self, i: usize) -> &LogicalPhysicalFieldRef {
+        &self.fields[i]
+    }
+
+    pub fn fields(&self) -> &LogicalPhysicalFields {
+        &self.fields
+    }
+}
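Reviewer note: the builder is a direct analogue of arrow's `SchemaBuilder`. A sketch of assembling a logical schema and lowering it at the execution boundary; the field names are illustrative:

```rust
use arrow_schema::{DataType, Schema};
use datafusion_common::logical_type::field::LogicalPhysicalField;
use datafusion_common::logical_type::schema::LogicalPhysicalSchemaBuilder;

fn main() {
    let mut builder = LogicalPhysicalSchemaBuilder::new();
    // push() accepts impl Into<LogicalPhysicalFieldRef>, so owned fields work.
    builder.push(LogicalPhysicalField::new("id", DataType::Int64, false));
    builder.push(LogicalPhysicalField::new("payload", DataType::Binary, true));
    let schema = builder.finish();
    assert_eq!(schema.fields().len(), 2);

    // Lower to a physical arrow Schema when handing off to physical plans.
    let physical: Schema = schema.into();
    assert_eq!(physical.fields().len(), 2);
}
```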
diff --git a/datafusion/common/src/logical_type/signature.rs b/datafusion/common/src/logical_type/signature.rs
new file mode 100644
index 000000000000..5e35870cc6aa
--- /dev/null
+++ b/datafusion/common/src/logical_type/signature.rs
@@ -0,0 +1,175 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements. See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership. The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License. You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+use core::fmt;
+use std::sync::Arc;
+
+use arrow_schema::{DataType, FieldRef, IntervalUnit, TimeUnit, UnionMode};
+
+use super::{
+    field::LogicalPhysicalFieldRef,
+    fields::{LogicalPhysicalFields, LogicalUnionFields},
+};
+
+#[derive(Clone, Debug, PartialEq, Eq, Hash)]
+pub enum LogicalType {
+    Null,
+    Int8,
+    Int16,
+    Int32,
+    Int64,
+    UInt8,
+    UInt16,
+    UInt32,
+    UInt64,
+    Boolean,
+    Float16,
+    Float32,
+    Float64,
+    Utf8,
+    Binary,
+    Date,
+    Time32(TimeUnit),
+    Time64(TimeUnit),
+    Timestamp(TimeUnit, Option<Arc<str>>),
+    Duration(TimeUnit),
+    Interval(IntervalUnit),
+    List(LogicalPhysicalFieldRef),
+    Struct(LogicalPhysicalFields),
+    Map(LogicalPhysicalFieldRef, bool),
+    Decimal128(u8, i8),
+    Decimal256(u8, i8),
+    Union(LogicalUnionFields, UnionMode), // TODO: extension signatures?
+}
+
+impl fmt::Display for LogicalType {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        write!(f, "{self:?}")
+    }
+}
+
+impl LogicalType {
+    /// Returns true if the type is primitive: (numeric, temporal).
+    #[inline]
+    pub fn is_primitive(&self) -> bool {
+        self.is_numeric() || self.is_temporal()
+    }
+
+    /// Returns true if this type is numeric: (UInt*, Int*, Float*, Decimal*).
+    #[inline]
+    pub fn is_numeric(&self) -> bool {
+        use LogicalType::*;
+        matches!(
+            self,
+            UInt8
+                | UInt16
+                | UInt32
+                | UInt64
+                | Int8
+                | Int16
+                | Int32
+                | Int64
+                | Float16
+                | Float32
+                | Float64
+                | Decimal128(_, _)
+                | Decimal256(_, _)
+        )
+    }
+
+    /// Returns true if this type is temporal: (Date*, Time*, Duration, or Interval).
+    #[inline]
+    pub fn is_temporal(&self) -> bool {
+        use LogicalType::*;
+        matches!(
+            self,
+            Date | Timestamp(_, _) | Time32(_) | Time64(_) | Duration(_) | Interval(_)
+        )
+    }
+
+    /// Returns true if this type is floating: (Float*).
+    #[inline]
+    pub fn is_floating(&self) -> bool {
+        use LogicalType::*;
+        matches!(self, Float16 | Float32 | Float64)
+    }
+
+    /// Returns true if this type is integer: (Int*, UInt*).
+    #[inline]
+    pub fn is_integer(&self) -> bool {
+        self.is_signed_integer() || self.is_unsigned_integer()
+    }
+
+    /// Returns true if this type is signed integer: (Int*).
+    #[inline]
+    pub fn is_signed_integer(&self) -> bool {
+        use LogicalType::*;
+        matches!(self, Int8 | Int16 | Int32 | Int64)
+    }
+
+    /// Returns true if this type is unsigned integer: (UInt*).
+    #[inline]
+    pub fn is_unsigned_integer(&self) -> bool {
+        use LogicalType::*;
+        matches!(self, UInt8 | UInt16 | UInt32 | UInt64)
+    }
+
+    /// Returns true if this type is TypeSignature::Null.
+    #[inline]
+    pub fn is_null(&self) -> bool {
+        use LogicalType::*;
+        matches!(self, Null)
+    }
+}
+
+impl Into<DataType> for LogicalType {
+    fn into(self) -> DataType {
+        match self {
+            LogicalType::Null => DataType::Null,
+            LogicalType::Int8 => DataType::Int8,
+            LogicalType::Int16 => DataType::Int16,
+            LogicalType::Int32 => DataType::Int32,
+            LogicalType::Int64 => DataType::Int64,
+            LogicalType::UInt8 => DataType::UInt8,
+            LogicalType::UInt16 => DataType::UInt16,
+            LogicalType::UInt32 => DataType::UInt32,
+            LogicalType::UInt64 => DataType::UInt64,
+            LogicalType::Boolean => DataType::Boolean,
+            LogicalType::Float16 => DataType::Float16,
+            LogicalType::Float32 => DataType::Float32,
+            LogicalType::Float64 => DataType::Float64,
+            LogicalType::Utf8 => DataType::Utf8,
+            LogicalType::Binary => DataType::Binary,
+            LogicalType::Date => DataType::Date32,
+            LogicalType::Time32(tu) => DataType::Time32(tu),
+            LogicalType::Time64(tu) => DataType::Time64(tu),
+            LogicalType::Timestamp(tu, tz) => DataType::Timestamp(tu, tz),
+            LogicalType::Duration(tu) => DataType::Duration(tu),
+            LogicalType::Interval(iu) => DataType::Interval(iu),
+            LogicalType::List(field) => {
+                DataType::List(FieldRef::new(field.as_ref().clone().into()))
+            }
+            LogicalType::Struct(fields) => DataType::Struct(fields.into()),
+            LogicalType::Map(field, v) => {
+                DataType::Map(FieldRef::new(field.as_ref().clone().into()), v)
+            }
+            LogicalType::Decimal128(a, b) => DataType::Decimal128(a, b),
+            LogicalType::Decimal256(a, b) => DataType::Decimal256(a, b),
+            LogicalType::Union(union, mode) => DataType::Union(union.into(), mode),
+        }
+    }
+}
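Reviewer note: `LogicalType` collapses width and encoding distinctions (`Utf8`/`LargeUtf8`/`Utf8View`, `Date32`/`Date64`, all the binary variants), which keeps its predicates simple; lowering back with `Into<DataType>` therefore has to pick one default physical encoding. A sketch:

```rust
use arrow_schema::DataType;
use datafusion_common::logical_type::signature::LogicalType;

fn main() {
    assert!(LogicalType::Decimal128(10, 2).is_numeric());
    assert!(LogicalType::Date.is_temporal());
    assert!(!LogicalType::Utf8.is_numeric());

    // Date64 also lifts to LogicalType::Date, but lowering picks Date32.
    let physical: DataType = LogicalType::Date.into();
    assert_eq!(physical, DataType::Date32);
}
```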
diff --git a/datafusion/common/src/param_value.rs b/datafusion/common/src/param_value.rs
index 8d61bad97b9f..8042f0663ba0 100644
--- a/datafusion/common/src/param_value.rs
+++ b/datafusion/common/src/param_value.rs
@@ -16,8 +16,8 @@
 // under the License.
 
 use crate::error::{_plan_datafusion_err, _plan_err};
+use crate::logical_type::LogicalPhysicalType;
 use crate::{Result, ScalarValue};
-use arrow_schema::DataType;
 use std::collections::HashMap;
 
 /// The parameter value corresponding to the placeholder
@@ -31,7 +31,7 @@ pub enum ParamValues {
 impl ParamValues {
     /// Verify parameter list length and type
-    pub fn verify(&self, expect: &[DataType]) -> Result<()> {
+    pub fn verify(&self, expect: &[LogicalPhysicalType]) -> Result<()> {
         match self {
             ParamValues::List(list) => {
                 // Verify if the number of params matches the number of values
@@ -46,7 +46,7 @@ impl ParamValues {
                 // Verify if the types of the params matches the types of the values
                 let iter = expect.iter().zip(list.iter());
                 for (i, (param_type, value)) in iter.enumerate() {
-                    if *param_type != value.data_type() {
+                    if *param_type != value.data_type().into() {
                         return _plan_err!(
                             "Expected parameter of type {:?}, got {:?} at index {}",
                             param_type,
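Reviewer note: with this change, placeholder verification compares logical types, so callers lift the expected arrow types with `.into()`. A sketch of the new calling convention:

```rust
use arrow_schema::DataType;
use datafusion_common::logical_type::LogicalPhysicalType;
use datafusion_common::{ParamValues, Result, ScalarValue};

fn main() -> Result<()> {
    let params = ParamValues::List(vec![ScalarValue::Int32(Some(7))]);
    // Expected types are logical now; ScalarValue::data_type() is still
    // physical and is lifted inside verify(), as the hunk above shows.
    let expected: Vec<LogicalPhysicalType> = vec![DataType::Int32.into()];
    params.verify(&expected)?;
    Ok(())
}
```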
diff --git a/datafusion/core/src/catalog/information_schema.rs b/datafusion/core/src/catalog/information_schema.rs
index c953de6d16d3..55ec1490972a 100644
--- a/datafusion/core/src/catalog/information_schema.rs
+++ b/datafusion/core/src/catalog/information_schema.rs
@@ -186,7 +186,7 @@ impl InformationSchemaConfig {
                         &schema_name,
                         &table_name,
                         field_position,
-                        field,
+                        &field.as_ref().clone().into(),
                     )
                 }
             }
diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs
index d0f2852a6e53..583663169ed9 100644
--- a/datafusion/core/src/dataframe/mod.rs
+++ b/datafusion/core/src/dataframe/mod.rs
@@ -46,8 +46,9 @@ use crate::prelude::SessionContext;
 use arrow::array::{Array, ArrayRef, Int64Array, StringArray};
 use arrow::compute::{cast, concat};
 use arrow::datatypes::{DataType, Field};
-use arrow_schema::{Schema, SchemaRef};
+use arrow_schema::Schema;
 use datafusion_common::config::{CsvOptions, JsonOptions};
+use datafusion_common::logical_type::signature::LogicalType;
 use datafusion_common::{
     plan_err, Column, DFSchema, DataFusionError, ParamValues, SchemaError, UnnestOptions,
 };
@@ -58,6 +59,8 @@ use datafusion_expr::{
 use datafusion_functions_aggregate::expr_fn::{avg, count, median, stddev, sum};
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
+use datafusion_common::logical_type::TypeRelation;
 
 /// Contains options that control how data is
 /// written out from a DataFrame
@@ -611,7 +614,7 @@ impl DataFrame {
         //define describe column
         let mut describe_schemas = vec![Field::new("describe", DataType::Utf8, false)];
         describe_schemas.extend(original_schema_fields.clone().map(|field| {
-            if field.data_type().is_numeric() {
+            if field.data_type().logical().is_numeric() {
                 Field::new(field.name(), DataType::Float64, true)
             } else {
                 Field::new(field.name(), DataType::Utf8, true)
@@ -647,7 +650,7 @@ impl DataFrame {
                 vec![],
                 original_schema_fields
                     .clone()
-                    .filter(|f| f.data_type().is_numeric())
+                    .filter(|f| f.data_type().logical().is_numeric())
                     .map(|f| avg(col(f.name())).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
@@ -656,7 +659,7 @@ impl DataFrame {
                 vec![],
                 original_schema_fields
                     .clone()
-                    .filter(|f| f.data_type().is_numeric())
+                    .filter(|f| f.data_type().logical().is_numeric())
                     .map(|f| stddev(col(f.name())).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
@@ -666,7 +669,10 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .filter(|f| {
-                        !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
+                        !matches!(
+                            f.data_type().logical(),
+                            LogicalType::Binary | LogicalType::Boolean
+                        )
                     })
                     .map(|f| min(col(f.name())).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
@@ -677,7 +683,10 @@ impl DataFrame {
                 original_schema_fields
                     .clone()
                     .filter(|f| {
-                        !matches!(f.data_type(), DataType::Binary | DataType::Boolean)
+                        !matches!(
+                            f.data_type().logical(),
+                            LogicalType::Binary | LogicalType::Boolean
+                        )
                     })
                     .map(|f| max(col(f.name())).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
@@ -687,7 +696,7 @@ impl DataFrame {
                 vec![],
                 original_schema_fields
                     .clone()
-                    .filter(|f| f.data_type().is_numeric())
+                    .filter(|f| f.data_type().logical().is_numeric())
                     .map(|f| median(col(f.name())).alias(f.name()))
                     .collect::<Vec<_>>(),
             ),
@@ -712,7 +721,7 @@ impl DataFrame {
                 {
                     let column = batchs[0].column_by_name(field.name()).unwrap();
 
-                    if field.data_type().is_numeric() {
+                    if field.data_type().logical().is_numeric() {
                         cast(column, &DataType::Float64)?
                     } else {
                         cast(column, &DataType::Utf8)?
@@ -1285,7 +1294,7 @@ impl DataFrame {
         let plan = LogicalPlanBuilder::insert_into(
             self.plan,
             table_name.to_owned(),
-            &arrow_schema,
+            &arrow_schema.into(),
             write_options.overwrite,
         )?
         .build()?;
@@ -1472,7 +1481,7 @@ impl DataFrame {
     ///
     /// The method supports case sensitive rename with wrapping column name into one of following symbols (  " or ' or ` )
    ///
-    /// Alternatively setting Datafusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable 
+    /// Alternatively setting Datafusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable
     /// case sensitive rename without need to wrap column name into special symbols
     ///
    /// # Example
@@ -1647,9 +1656,8 @@ impl TableProvider for DataFrameTableProvider {
         Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
     }
 
-    fn schema(&self) -> SchemaRef {
-        let schema: Schema = self.plan.schema().as_ref().into();
-        Arc::new(schema)
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        self.plan.schema().inner().clone()
     }
 
     fn table_type(&self) -> TableType {
@@ -1694,6 +1702,7 @@ mod tests {
     use crate::test_util::{register_aggregate_csv, test_table, test_table_with_name};
 
     use arrow::array::{self, Int32Array};
+    use arrow_schema::SchemaRef;
     use datafusion_common::{Constraint, Constraints};
     use datafusion_common_runtime::SpawnedTask;
     use datafusion_expr::{
@@ -2362,7 +2371,7 @@ mod tests {
         let field = df.schema().field(0);
         // There are two columns named 'c', one from the input of the aggregate and the other from the output.
         // Select should return the column from the output of the aggregate, which is a list.
- assert!(matches!(field.data_type(), DataType::List(_))); + assert!(matches!(field.data_type().logical(), LogicalType::List(_))); Ok(()) } diff --git a/datafusion/core/src/datasource/cte_worktable.rs b/datafusion/core/src/datasource/cte_worktable.rs index afc4536f068e..570e8afb2ea7 100644 --- a/datafusion/core/src/datasource/cte_worktable.rs +++ b/datafusion/core/src/datasource/cte_worktable.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_physical_plan::work_table::WorkTableExec; use crate::{ @@ -67,8 +68,8 @@ impl TableProvider for CteWorkTable { None } - fn schema(&self) -> SchemaRef { - self.table_schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.table_schema.clone().into()) } fn table_type(&self) -> TableType { diff --git a/datafusion/core/src/datasource/default_table_source.rs b/datafusion/core/src/datasource/default_table_source.rs index 977e681d6641..4b3cb695413d 100644 --- a/datafusion/core/src/datasource/default_table_source.rs +++ b/datafusion/core/src/datasource/default_table_source.rs @@ -22,7 +22,7 @@ use std::sync::Arc; use crate::datasource::TableProvider; -use arrow::datatypes::SchemaRef; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::{internal_err, Constraints}; use datafusion_expr::{Expr, TableProviderFilterPushDown, TableSource}; @@ -52,7 +52,7 @@ impl TableSource for DefaultTableSource { } /// Get a reference to the schema for this table - fn schema(&self) -> SchemaRef { + fn schema(&self) -> LogicalPhysicalSchemaRef { self.table_provider.schema() } diff --git a/datafusion/core/src/datasource/empty.rs b/datafusion/core/src/datasource/empty.rs index 5100987520ee..e2c4ff1bbbe4 100644 --- a/datafusion/core/src/datasource/empty.rs +++ b/datafusion/core/src/datasource/empty.rs @@ -22,6 +22,7 @@ use std::sync::Arc; use arrow::datatypes::*; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::project_schema; use crate::datasource::{TableProvider, TableType}; @@ -59,8 +60,8 @@ impl TableProvider for EmptyTable { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } fn table_type(&self) -> TableType { diff --git a/datafusion/core/src/datasource/listing/helpers.rs b/datafusion/core/src/datasource/listing/helpers.rs index c1ce4cc5b6c5..2d041e4a1c18 100644 --- a/datafusion/core/src/datasource/listing/helpers.rs +++ b/datafusion/core/src/datasource/listing/helpers.rs @@ -38,6 +38,7 @@ use futures::stream::FuturesUnordered; use futures::{stream::BoxStream, StreamExt, TryStreamExt}; use log::{debug, trace}; +use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_common::tree_node::{TreeNode, TreeNodeRecursion}; use datafusion_common::{Column, DFSchema, DataFusionError}; use datafusion_expr::{Expr, Volatility}; @@ -264,7 +265,7 @@ async fn prune_partitions( let df_schema = DFSchema::from_unqualified_fields( partition_cols .iter() - .map(|(n, d)| Field::new(n, d.clone(), true)) + .map(|(n, d)| LogicalPhysicalField::new(n, d.clone(), true)) .collect(), Default::default(), )?; diff --git a/datafusion/core/src/datasource/listing/table.rs b/datafusion/core/src/datasource/listing/table.rs index ea4d396a14cb..b43751c39161 100644 --- 
a/datafusion/core/src/datasource/listing/table.rs +++ b/datafusion/core/src/datasource/listing/table.rs @@ -52,6 +52,7 @@ use datafusion_physical_expr::{ }; use async_trait::async_trait; +use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef}; use futures::{future, stream, StreamExt, TryStreamExt}; use itertools::Itertools; use object_store::ObjectStore; @@ -722,8 +723,8 @@ impl TableProvider for ListingTable { self } - fn schema(&self) -> SchemaRef { - Arc::clone(&self.table_schema) + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.table_schema.clone().into()) } fn constraints(&self) -> Option<&Constraints> { @@ -746,7 +747,8 @@ impl TableProvider for ListingTable { // if no files need to be read, return an `EmptyExec` if partitioned_file_lists.is_empty() { - let projected_schema = project_schema(&self.schema(), projection)?; + let schema = SchemaRef::new(self.schema().as_ref().clone().into()); + let projected_schema = project_schema(&schema, projection)?; return Ok(Arc::new(EmptyExec::new(projected_schema))); } @@ -787,7 +789,8 @@ impl TableProvider for ListingTable { let filters = if let Some(expr) = conjunction(filters.to_vec()) { // NOTE: Use the table schema (NOT file schema) here because `expr` may contain references to partition columns. - let table_df_schema = self.table_schema.as_ref().clone().to_dfschema()?; + let table_df_schema = + LogicalPhysicalSchema::from(self.table_schema.as_ref().clone()).to_dfschema()?; let filters = create_physical_expr(&expr, &table_df_schema, state.execution_props())?; Some(filters) @@ -856,11 +859,9 @@ impl TableProvider for ListingTable { input: Arc, overwrite: bool, ) -> Result> { + let schema = SchemaRef::new(self.schema().as_ref().clone().into()); // Check that the schema of the plan matches the schema of this table. - if !self - .schema() - .logically_equivalent_names_and_types(&input.schema()) - { + if !schema.logically_equivalent_names_and_types(&input.schema()) { return plan_err!( // Return an error if schema of the input query does not match with the table schema. "Inserting query must have the same schema with the table." @@ -897,7 +898,7 @@ impl TableProvider for ListingTable { object_store_url: self.table_paths()[0].object_store(), table_paths: self.table_paths().clone(), file_groups, - output_schema: self.schema(), + output_schema: SchemaRef::new(self.schema().as_ref().clone().into()), table_partition_cols: self.options.table_partition_cols.clone(), overwrite, keep_partition_by_columns, @@ -980,13 +981,10 @@ impl ListingTable { .boxed() .buffered(ctx.config_options().execution.meta_fetch_concurrency); - let (files, statistics) = get_statistics_with_limit( - files, - self.schema(), - limit, - self.options.collect_stat, - ) - .await?; + let schema = SchemaRef::new(self.schema().as_ref().clone().into()); + let (files, statistics) = + get_statistics_with_limit(files, schema, limit, self.options.collect_stat) + .await?; Ok(( split_files(files, self.options.target_partitions), @@ -1251,8 +1249,9 @@ mod tests { .with_schema(file_schema); let table = ListingTable::try_new(config)?; + let table_schema = table.schema().as_ref().clone().into(); assert_eq!( - columns(&table.schema()), + columns(&table_schema), vec!["a".to_owned(), "p1".to_owned()] ); @@ -1878,8 +1877,13 @@ mod tests { // Since logical plan contains a filter, increasing parallelism is helpful. // Therefore, we will have 8 partitions in the final plan. 
// Create an insert plan to insert the source data into the initial table - let insert_into_table = - LogicalPlanBuilder::insert_into(scan_plan, "t", &schema, false)?.build()?; + let insert_into_table = LogicalPlanBuilder::insert_into( + scan_plan, + "t", + &schema.as_ref().clone().into(), + false, + )? + .build()?; // Create a physical plan from the insert plan let plan = session_ctx .state() diff --git a/datafusion/core/src/datasource/memory.rs b/datafusion/core/src/datasource/memory.rs index aab42285a0b2..1a78f0924b4f 100644 --- a/datafusion/core/src/datasource/memory.rs +++ b/datafusion/core/src/datasource/memory.rs @@ -17,11 +17,6 @@ //! [`MemTable`] for querying `Vec` by DataFusion. -use std::any::Any; -use std::collections::HashMap; -use std::fmt::{self, Debug}; -use std::sync::Arc; - use crate::datasource::{TableProvider, TableType}; use crate::error::Result; use crate::execution::context::SessionState; @@ -34,6 +29,10 @@ use crate::physical_plan::{ Partitioning, SendableRecordBatchStream, }; use crate::physical_planner::create_physical_sort_exprs; +use std::any::Any; +use std::collections::HashMap; +use std::fmt::{self, Debug}; +use std::sync::Arc; use arrow::datatypes::SchemaRef; use arrow::record_batch::RecordBatch; @@ -42,6 +41,7 @@ use datafusion_execution::TaskContext; use datafusion_physical_plan::metrics::MetricsSet; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use futures::StreamExt; use log::debug; use parking_lot::Mutex; @@ -159,6 +159,7 @@ impl MemTable { } } + let schema = SchemaRef::new(schema.as_ref().clone().into()); let exec = MemoryExec::try_new(&data, schema.clone(), None)?; if let Some(num_partitions) = output_partitions { @@ -192,8 +193,8 @@ impl TableProvider for MemTable { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } fn constraints(&self) -> Option<&Constraints> { @@ -217,8 +218,11 @@ impl TableProvider for MemTable { partitions.push(inner_vec.clone()) } - let mut exec = - MemoryExec::try_new(&partitions, self.schema(), projection.cloned())?; + let mut exec = MemoryExec::try_new( + &partitions, + SchemaRef::new(self.schema().as_ref().clone().into()), + projection.cloned(), + )?; let show_sizes = state.config_options().explain.show_sizes; exec = exec.with_show_sizes(show_sizes); @@ -267,10 +271,8 @@ impl TableProvider for MemTable { // Create a physical plan from the logical plan. // Check that the schema of the plan matches the schema of this table. - if !self - .schema() - .logically_equivalent_names_and_types(&input.schema()) - { + let schema = SchemaRef::new(self.schema.as_ref().clone()); + if !schema.logically_equivalent_names_and_types(&input.schema()) { return plan_err!( "Inserting query must have the same schema with the table." ); @@ -623,8 +625,13 @@ mod tests { // Create a table scan logical plan to read from the source table let scan_plan = LogicalPlanBuilder::scan("source", source, None)?.build()?; // Create an insert plan to insert the source data into the initial table - let insert_into_table = - LogicalPlanBuilder::insert_into(scan_plan, "t", &schema, false)?.build()?; + let insert_into_table = LogicalPlanBuilder::insert_into( + scan_plan, + "t", + &schema.as_ref().clone().into(), + false, + )? 
+ .build()?; // Create a physical plan from the insert plan let plan = session_ctx .state() diff --git a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs index f5d3c7a6410d..684a30c50cfa 100644 --- a/datafusion/core/src/datasource/physical_plan/file_scan_config.rs +++ b/datafusion/core/src/datasource/physical_plan/file_scan_config.rs @@ -625,10 +625,9 @@ fn create_output_array( #[cfg(test)] mod tests { - use arrow_array::Int32Array; - use super::*; use crate::{test::columns, test_util::aggr_test_schema}; + use arrow_array::Int32Array; #[test] fn physical_plan_config_no_projection() { diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs index 9bc79805746f..32fd471d9af2 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/row_groups.rs @@ -411,7 +411,6 @@ mod tests { use crate::datasource::physical_plan::parquet::reader::ParquetFileReader; use crate::physical_plan::metrics::ExecutionPlanMetricsSet; - use arrow::datatypes::DataType::Decimal128; use arrow::datatypes::{DataType, Field}; use datafusion_common::Result; use datafusion_expr::{cast, col, lit, Expr}; @@ -821,7 +820,7 @@ mod tests { let schema_descr = get_test_schema_descr(vec![field]); let expr = cast(col("c1"), DataType::Decimal128(11, 2)).gt(cast( lit(ScalarValue::Decimal128(Some(500), 5, 2)), - Decimal128(11, 2), + DataType::Decimal128(11, 2), )); let expr = logical2physical(&expr, &schema); let pruning_predicate = PruningPredicate::try_new(expr, schema.clone()).unwrap(); diff --git a/datafusion/core/src/datasource/provider.rs b/datafusion/core/src/datasource/provider.rs index 7c58aded3108..89f4ea060bc4 100644 --- a/datafusion/core/src/datasource/provider.rs +++ b/datafusion/core/src/datasource/provider.rs @@ -21,11 +21,11 @@ use std::any::Any; use std::sync::Arc; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::{not_impl_err, Constraints, Statistics}; use datafusion_expr::{CreateExternalTable, LogicalPlan}; pub use datafusion_expr::{TableProviderFilterPushDown, TableType}; -use crate::arrow::datatypes::SchemaRef; use crate::datasource::listing_table_factory::ListingTableFactory; use crate::datasource::stream::StreamTableFactory; use crate::error::Result; @@ -41,7 +41,7 @@ pub trait TableProvider: Sync + Send { fn as_any(&self) -> &dyn Any; /// Get a reference to the schema for this table - fn schema(&self) -> SchemaRef; + fn schema(&self) -> LogicalPhysicalSchemaRef; /// Get a reference to the constraints of the table. 
/// Returns: diff --git a/datafusion/core/src/datasource/stream.rs b/datafusion/core/src/datasource/stream.rs index 9cfdb7bb1168..1538ed4b9560 100644 --- a/datafusion/core/src/datasource/stream.rs +++ b/datafusion/core/src/datasource/stream.rs @@ -42,6 +42,7 @@ use datafusion_physical_plan::streaming::{PartitionStream, StreamingTableExec}; use datafusion_physical_plan::{DisplayAs, DisplayFormatType, ExecutionPlan}; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use futures::StreamExt; /// A [`TableProviderFactory`] for [`StreamTable`] @@ -308,8 +309,8 @@ impl TableProvider for StreamTable { self } - fn schema(&self) -> SchemaRef { - self.0.source.schema().clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.0.source.schema().clone().into()) } fn constraints(&self) -> Option<&Constraints> { diff --git a/datafusion/core/src/datasource/streaming.rs b/datafusion/core/src/datasource/streaming.rs index 0ba6f85ec3e2..041a9a46097a 100644 --- a/datafusion/core/src/datasource/streaming.rs +++ b/datafusion/core/src/datasource/streaming.rs @@ -23,14 +23,14 @@ use std::sync::Arc; use arrow::datatypes::SchemaRef; use async_trait::async_trait; -use datafusion_common::{plan_err, Result}; -use datafusion_expr::{Expr, TableType}; -use log::debug; - use crate::datasource::TableProvider; use crate::execution::context::SessionState; use crate::physical_plan::streaming::{PartitionStream, StreamingTableExec}; use crate::physical_plan::ExecutionPlan; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; +use datafusion_common::{plan_err, Result}; +use datafusion_expr::{Expr, TableType}; +use log::debug; /// A [`TableProvider`] that streams a set of [`PartitionStream`] pub struct StreamingTable { @@ -75,8 +75,8 @@ impl TableProvider for StreamingTable { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } fn table_type(&self) -> TableType { diff --git a/datafusion/core/src/datasource/view.rs b/datafusion/core/src/datasource/view.rs index 3f024a6b4cb7..0d3effcd71a1 100644 --- a/datafusion/core/src/datasource/view.rs +++ b/datafusion/core/src/datasource/view.rs @@ -21,6 +21,7 @@ use std::{any::Any, sync::Arc}; use arrow::datatypes::SchemaRef; use async_trait::async_trait; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::Column; use datafusion_expr::{LogicalPlanBuilder, TableProviderFilterPushDown}; @@ -82,8 +83,8 @@ impl TableProvider for ViewTable { Some(&self.logical_plan) } - fn schema(&self) -> SchemaRef { - Arc::clone(&self.table_schema) + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.table_schema.clone().into()) } fn table_type(&self) -> TableType { diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index c123ebb22ecb..f6716cd89630 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -42,13 +42,15 @@ use crate::physical_optimizer::optimizer::PhysicalOptimizer; use crate::physical_optimizer::PhysicalOptimizerRule; use crate::physical_planner::{DefaultPhysicalPlanner, PhysicalPlanner}; use crate::{functions, functions_aggregate}; -use arrow_schema::{DataType, SchemaRef}; +use arrow_schema::SchemaRef; use async_trait::async_trait; use chrono::{DateTime, Utc}; use 
datafusion_common::alias::AliasGenerator;
 use datafusion_common::config::{ConfigExtension, ConfigOptions, TableOptions};
 use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan};
 use datafusion_common::file_options::file_type::FileType;
+use datafusion_common::logical_type::signature::LogicalType;
+use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType};
 use datafusion_common::tree_node::TreeNode;
 use datafusion_common::{
     config_err, not_impl_err, plan_datafusion_err, DFSchema, DataFusionError,
@@ -1033,7 +1035,7 @@ impl<'a> ContextProvider for SessionContextProvider<'a> {
         self.state.window_functions().get(name).cloned()
     }
 
-    fn get_variable_type(&self, variable_names: &[String]) -> Option<DataType> {
+    fn get_variable_type(&self, variable_names: &[String]) -> Option<LogicalPhysicalType> {
         if variable_names.is_empty() {
             return None;
         }
@@ -1262,7 +1264,7 @@ impl<'a> SessionSimplifyProvider<'a> {
 impl<'a> SimplifyInfo for SessionSimplifyProvider<'a> {
     fn is_boolean_type(&self, expr: &Expr) -> datafusion_common::Result<bool> {
-        Ok(expr.get_type(self.df_schema)? == DataType::Boolean)
+        Ok(expr.get_type(self.df_schema)?.logical() == &LogicalType::Boolean)
     }
 
     fn nullable(&self, expr: &Expr) -> datafusion_common::Result<bool> {
@@ -1273,7 +1275,7 @@ impl<'a> SimplifyInfo for SessionSimplifyProvider<'a> {
         self.state.execution_props()
     }
 
-    fn get_data_type(&self, expr: &Expr) -> datafusion_common::Result<DataType> {
+    fn get_data_type(&self, expr: &Expr) -> datafusion_common::Result<LogicalPhysicalType> {
         expr.get_type(self.df_schema)
     }
 }
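Reviewer note: the `SimplifyInfo` change means boolean-ness is decided on the logical half of the type. A hypothetical helper (not part of the patch) showing the comparison pattern used above:

```rust
use arrow_schema::DataType;
use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::{LogicalPhysicalType, TypeRelation};

// Mirrors `is_boolean_type` above: compare logical signatures, not DataTypes.
fn is_boolean(t: &LogicalPhysicalType) -> bool {
    t.logical() == &LogicalType::Boolean
}

fn main() {
    assert!(is_boolean(&DataType::Boolean.into()));
    assert!(!is_boolean(&DataType::Int8.into()));
}
```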
diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs
index 6aad4d575532..6c51a73a53b6 100644
--- a/datafusion/core/src/physical_planner.rs
+++ b/datafusion/core/src/physical_planner.rs
@@ -2271,6 +2271,7 @@ mod tests {
 
     use arrow::array::{ArrayRef, DictionaryArray, Int32Array};
     use arrow::datatypes::{DataType, Field, Int32Type};
+    use datafusion_common::logical_type::field::LogicalPhysicalField;
     use datafusion_common::{assert_contains, DFSchemaRef, TableReference};
     use datafusion_execution::runtime_env::RuntimeEnv;
     use datafusion_execution::TaskContext;
@@ -2511,24 +2512,7 @@ mod tests {
             .create_physical_plan(&logical_plan, &session_state)
             .await;
 
-        let expected_error: &str = "Error during planning: \
-        Extension planner for NoOp created an ExecutionPlan with mismatched schema. \
-        LogicalPlan schema: \
-        DFSchema { inner: Schema { fields: \
-        [Field { name: \"a\", \
-        data_type: Int32, \
-        nullable: false, \
-        dict_id: 0, \
-        dict_is_ordered: false, metadata: {} }], \
-        metadata: {} }, field_qualifiers: [None], \
-        functional_dependencies: FunctionalDependencies { deps: [] } }, \
-        ExecutionPlan schema: Schema { fields: \
-        [Field { name: \"b\", \
-        data_type: Int32, \
-        nullable: false, \
-        dict_id: 0, \
-        dict_is_ordered: false, metadata: {} }], \
-        metadata: {} }";
+        let expected_error: &str = r#"Error during planning: Extension planner for NoOp created an ExecutionPlan with mismatched schema. LogicalPlan schema: DFSchema { inner: LogicalSchema { fields: [LogicalField { name: "a", data_type: TypeRelation(NativeType { logical: Int32, physical: Int32 }), nullable: false, metadata: {} }], metadata: {} }, field_qualifiers: [None], functional_dependencies: FunctionalDependencies { deps: [] } }, ExecutionPlan schema: Schema { fields: [Field { name: "b", data_type: Int32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }], metadata: {} }"#;
         match plan {
             Ok(_) => panic!("Expected planning failure"),
             Err(e) => assert!(
@@ -2574,7 +2558,7 @@ mod tests {
 
         assert_contains!(
             &e,
-            r#"Error during planning: Can not find compatible types to compare Boolean with [Struct([Field { name: "foo", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), Utf8]"#
+            r#"Can not find compatible types to compare TypeRelation(NativeType { logical: Boolean, physical: Boolean }) with [Struct([Field { name: "foo", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }]), Utf8]"#
         );
 
         Ok(())
@@ -2793,7 +2777,7 @@ mod tests {
             Self {
                 schema: DFSchemaRef::new(
                     DFSchema::from_unqualified_fields(
-                        vec![Field::new("a", DataType::Int32, false)].into(),
+                        vec![LogicalPhysicalField::new("a", DataType::Int32, false)].into(),
                         HashMap::new(),
                     )
                     .unwrap(),
diff --git a/datafusion/core/src/test/mod.rs b/datafusion/core/src/test/mod.rs
index e8550a79cb0e..1212b572df3e 100644
--- a/datafusion/core/src/test/mod.rs
+++ b/datafusion/core/src/test/mod.rs
@@ -206,7 +206,7 @@ pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) {
         .schema()
         .fields()
         .iter()
-        .map(|f| f.name().clone())
+        .map(|f| f.name().to_string())
         .collect();
     assert_eq!(actual, expected);
 }
diff --git a/datafusion/core/src/test/variable.rs b/datafusion/core/src/test/variable.rs
index 38207b42cb7b..55915ede3ac5 100644
--- a/datafusion/core/src/test/variable.rs
+++ b/datafusion/core/src/test/variable.rs
@@ -20,7 +20,8 @@ use crate::error::Result;
 use crate::scalar::ScalarValue;
 use crate::variable::VarProvider;
-use arrow::datatypes::DataType;
+use arrow_schema::DataType;
+use datafusion_common::logical_type::LogicalPhysicalType;
 
 /// System variable
 #[derive(Default, Debug)]
@@ -40,8 +41,8 @@ impl VarProvider for SystemVar {
         Ok(ScalarValue::from(s))
     }
 
-    fn get_type(&self, _: &[String]) -> Option<DataType> {
-        Some(DataType::Utf8)
+    fn get_type(&self, _: &[String]) -> Option<LogicalPhysicalType> {
+        Some(DataType::Utf8.into())
     }
 }
 
@@ -67,11 +68,11 @@ impl VarProvider for UserDefinedVar {
         }
     }
 
-    fn get_type(&self, var_names: &[String]) -> Option<DataType> {
+    fn get_type(&self, var_names: &[String]) -> Option<LogicalPhysicalType> {
         if var_names[0] != "@integer" {
-            Some(DataType::Utf8)
+            Some(DataType::Utf8.into())
         } else {
-            Some(DataType::Int32)
+            Some(DataType::Int32.into())
         }
     }
 }
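Reviewer note: every `VarProvider` implementation changes the same way; `get_type` now answers with a `LogicalPhysicalType`, and existing `DataType` values are lifted with `.into()`. A hypothetical provider (illustrative only) under the new signature:

```rust
use arrow_schema::DataType;
use datafusion::error::Result;
use datafusion::scalar::ScalarValue;
use datafusion::variable::VarProvider;
use datafusion_common::logical_type::LogicalPhysicalType;

#[derive(Default, Debug)]
struct ConstProvider;

impl VarProvider for ConstProvider {
    fn get_value(&self, _var_names: Vec<String>) -> Result<ScalarValue> {
        Ok(ScalarValue::Int64(Some(42)))
    }

    // The physical DataType lifts into the logical type via Into.
    fn get_type(&self, _var_names: &[String]) -> Option<LogicalPhysicalType> {
        Some(DataType::Int64.into())
    }
}
```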
diff --git a/datafusion/core/src/test_util/mod.rs b/datafusion/core/src/test_util/mod.rs
index 059fa8fc6da7..6e6d52422219 100644
--- a/datafusion/core/src/test_util/mod.rs
+++ b/datafusion/core/src/test_util/mod.rs
@@ -49,9 +49,9 @@ use datafusion_expr::{CreateExternalTable, Expr, TableType};
 use datafusion_physical_expr::EquivalenceProperties;
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 use futures::Stream;
 use tempfile::TempDir;
-
 // backwards compatibility
 #[cfg(feature = "parquet")]
 pub use datafusion_common::test_util::parquet_test_data;
@@ -203,8 +203,8 @@ impl TableProvider for TestTableProvider {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        self.schema.clone()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.schema.clone().into())
     }
 
     fn table_type(&self) -> TableType {
diff --git a/datafusion/core/src/test_util/parquet.rs b/datafusion/core/src/test_util/parquet.rs
index 9f06ad9308ab..e44a35e1a2ca 100644
--- a/datafusion/core/src/test_util/parquet.rs
+++ b/datafusion/core/src/test_util/parquet.rs
@@ -38,6 +38,7 @@ use crate::physical_plan::ExecutionPlan;
 use crate::prelude::{Expr, SessionConfig, SessionContext};
 
 use crate::datasource::physical_plan::parquet::ParquetExecBuilder;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchema;
 use object_store::path::Path;
 use object_store::ObjectMeta;
 use parquet::arrow::ArrowWriter;
@@ -153,7 +154,8 @@ impl TestParquetFile {
             extensions: None,
         });
 
-        let df_schema = self.schema.clone().to_dfschema_ref()?;
+        let df_schema =
+            LogicalPhysicalSchema::from(self.schema.as_ref().clone()).to_dfschema_ref()?;
 
         // run coercion on the filters to coerce types etc.
         let props = ExecutionProps::new();
diff --git a/datafusion/core/tests/custom_sources_cases/mod.rs b/datafusion/core/tests/custom_sources_cases/mod.rs
index eebc946ccb68..7f36c2634fd5 100644
--- a/datafusion/core/tests/custom_sources_cases/mod.rs
+++ b/datafusion/core/tests/custom_sources_cases/mod.rs
@@ -43,6 +43,7 @@ use datafusion_physical_plan::placeholder_row::PlaceholderRowExec;
 use datafusion_physical_plan::{ExecutionMode, PlanProperties};
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 use futures::stream::Stream;
 
 mod provider_filter_pushdown;
@@ -80,6 +81,7 @@ struct CustomExecutionPlan {
 impl CustomExecutionPlan {
     fn new(projection: Option<Vec<usize>>) -> Self {
         let schema = TEST_CUSTOM_SCHEMA_REF!();
+        let schema = SchemaRef::new(schema.as_ref().clone().into());
         let schema =
             project_schema(&schema, projection.as_ref()).expect("projected schema");
         let cache = Self::compute_properties(schema);
@@ -202,8 +204,8 @@ impl TableProvider for CustomTableProvider {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        TEST_CUSTOM_SCHEMA_REF!()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(TEST_CUSTOM_SCHEMA_REF!().as_ref().clone().into())
     }
 
     fn table_type(&self) -> TableType {
diff --git a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
index b5506b7c12f6..0362a9e51e96 100644
--- a/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
+++ b/datafusion/core/tests/custom_sources_cases/provider_filter_pushdown.rs
@@ -39,6 +39,7 @@ use datafusion_functions_aggregate::expr_fn::count;
 use datafusion_physical_expr::EquivalenceProperties;
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 
 fn create_batch(value: i32, num_rows: usize) -> Result<RecordBatch> {
     let mut builder = Int32Builder::with_capacity(num_rows);
@@ -152,8 +153,8 @@ impl TableProvider for CustomProvider {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        self.zero_batch.schema()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.zero_batch.schema().clone().into())
     }
 
     fn table_type(&self) -> TableType {
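Reviewer note: the dominant pattern in these hunks is a provider that stores a physical `SchemaRef` and lifts it on demand. A condensed sketch of that pattern (the struct is hypothetical; a real `TableProvider` needs the full trait impl):

```rust
use std::sync::Arc;

use arrow_schema::{DataType, Field, Schema, SchemaRef};
use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;

struct MyTable {
    schema: SchemaRef, // physical schema, as most providers already hold
}

impl MyTable {
    // Same body shape as the TableProvider::schema impls above.
    fn schema(&self) -> LogicalPhysicalSchemaRef {
        LogicalPhysicalSchemaRef::new(self.schema.as_ref().clone().into())
    }
}

fn main() {
    let t = MyTable {
        schema: Arc::new(Schema::new(vec![Field::new("x", DataType::Int32, true)])),
    };
    assert_eq!(t.schema().fields().len(), 1);
}
```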
diff --git a/datafusion/core/tests/custom_sources_cases/statistics.rs b/datafusion/core/tests/custom_sources_cases/statistics.rs
index 2d42b03bfed8..bd13f62dae22 100644
--- a/datafusion/core/tests/custom_sources_cases/statistics.rs
+++ b/datafusion/core/tests/custom_sources_cases/statistics.rs
@@ -36,6 +36,7 @@ use datafusion_common::{project_schema, stats::Precision};
 use datafusion_physical_expr::EquivalenceProperties;
 
 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 
 /// This is a testing structure for statistics
 /// It will act both as a table provider and execution plan
@@ -79,8 +80,8 @@ impl TableProvider for StatisticsValidation {
         self
     }
 
-    fn schema(&self) -> SchemaRef {
-        Arc::clone(&self.schema)
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.schema.clone().into())
     }
 
     fn table_type(&self) -> TableType {
diff --git a/datafusion/core/tests/dataframe/dataframe_functions.rs b/datafusion/core/tests/dataframe/dataframe_functions.rs
index 1c55c48fea40..013b8c9b578d 100644
--- a/datafusion/core/tests/dataframe/dataframe_functions.rs
+++ b/datafusion/core/tests/dataframe/dataframe_functions.rs
@@ -1052,7 +1052,7 @@ async fn test_fn_decode() -> Result<()> {
     let expr = decode(encode(col("a"), lit("hex")), lit("hex"))
         // need to cast to utf8 otherwise the default display of binary array is hex
         // so it looks like nothing is done
-        .cast_to(&DataType::Utf8, &df_schema)?;
+        .cast_to(&DataType::Utf8.into(), &df_schema)?;
 
     let expected = [
         "+------------------------------------------------+",
diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs
index 2d1904d9e166..9874b01e067d 100644
--- a/datafusion/core/tests/dataframe/mod.rs
+++ b/datafusion/core/tests/dataframe/mod.rs
@@ -47,6 +47,7 @@ use datafusion::prelude::JoinType;
 use datafusion::prelude::{CsvReadOptions, ParquetReadOptions};
 use datafusion::test_util::{parquet_test_data, populate_csv_partitions};
 use datafusion::{assert_batches_eq, assert_batches_sorted_eq};
+use datafusion_common::logical_type::LogicalPhysicalType;
 use datafusion_common::{assert_contains, DataFusionError, ScalarValue, UnnestOptions};
 use datafusion_execution::config::SessionConfig;
 use datafusion_execution::runtime_env::RuntimeEnv;
@@ -748,8 +749,8 @@ async fn join_with_alias_filter() -> Result<()> {
 
     // filter: t1.a + CAST(Int64(1), UInt32) = t2.a + CAST(Int64(2), UInt32) as t1.a + 1 = t2.a + 2
     let filter = Expr::eq(
-        col("t1.a") + lit(3i64).cast_to(&DataType::UInt32, &t1_schema)?,
-        col("t2.a") + lit(1i32).cast_to(&DataType::UInt32, &t2_schema)?,
+        col("t1.a") + lit(3i64).cast_to(&DataType::UInt32.into(), &t1_schema)?,
+        col("t2.a") + lit(1i32).cast_to(&DataType::UInt32.into(), &t2_schema)?,
     )
     .alias("t1.b + 1 = t2.a + 2");
@@ -1927,8 +1928,8 @@ impl VarProvider for HardcodedIntProvider {
         Ok(ScalarValue::Int64(Some(1234)))
     }
 
-    fn get_type(&self, _: &[String]) -> Option<DataType> {
-        Some(DataType::Int64)
+    fn get_type(&self, _: &[String]) -> Option<LogicalPhysicalType> {
+        Some(DataType::Int64.into())
     }
 }
 
diff --git a/datafusion/core/tests/expr_api/parse_sql_expr.rs b/datafusion/core/tests/expr_api/parse_sql_expr.rs
index 991579b5a350..f2e021f48018 100644
--- a/datafusion/core/tests/expr_api/parse_sql_expr.rs
+++ b/datafusion/core/tests/expr_api/parse_sql_expr.rs
@@ -15,8 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
-use arrow_schema::{DataType, Field, Schema}; +use arrow_schema::DataType; use datafusion::prelude::{CsvReadOptions, SessionContext}; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; use datafusion_common::{DFSchemaRef, Result, ToDFSchema}; use datafusion_expr::Expr; use datafusion_sql::unparser::Unparser; @@ -27,10 +29,10 @@ use datafusion_sql::unparser::Unparser; /// b: Int32 /// s: Float32 fn schema() -> DFSchemaRef { - Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Int32, false), - Field::new("c", DataType::Float32, false), + LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("a", DataType::Int32, true), + LogicalPhysicalField::new("b", DataType::Int32, false), + LogicalPhysicalField::new("c", DataType::Float32, false), ]) .to_dfschema_ref() .unwrap() diff --git a/datafusion/core/tests/expr_api/simplification.rs b/datafusion/core/tests/expr_api/simplification.rs index 9ce47153ba4a..83ce741039d8 100644 --- a/datafusion/core/tests/expr_api/simplification.rs +++ b/datafusion/core/tests/expr_api/simplification.rs @@ -23,6 +23,10 @@ use arrow_buffer::IntervalDayTime; use chrono::{DateTime, TimeZone, Utc}; use datafusion::{error::Result, execution::context::ExecutionProps, prelude::*}; use datafusion_common::cast::as_int32_array; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::ScalarValue; use datafusion_common::{DFSchemaRef, ToDFSchema}; use datafusion_expr::expr::ScalarFunction; @@ -55,8 +59,8 @@ struct MyInfo { impl SimplifyInfo for MyInfo { fn is_boolean_type(&self, expr: &Expr) -> Result { Ok(matches!( - expr.get_type(self.schema.as_ref())?, - DataType::Boolean + expr.get_type(self.schema.as_ref())?.logical(), + LogicalType::Boolean )) } @@ -68,7 +72,7 @@ impl SimplifyInfo for MyInfo { &self.execution_props } - fn get_data_type(&self, expr: &Expr) -> Result { + fn get_data_type(&self, expr: &Expr) -> Result { expr.get_type(self.schema.as_ref()) } } @@ -88,10 +92,10 @@ impl From for MyInfo { /// b: Int32 /// s: Utf8 fn schema() -> DFSchemaRef { - Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Int32, false), - Field::new("s", DataType::Utf8, false), + LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("a", DataType::Int32, true), + LogicalPhysicalField::new("b", DataType::Int32, false), + LogicalPhysicalField::new("s", DataType::Utf8, false), ]) .to_dfschema_ref() .unwrap() @@ -281,7 +285,7 @@ fn select_date_plus_interval() -> Result<()> { let schema = table_scan.schema(); let date_plus_interval_expr = to_timestamp_expr(ts_string) - .cast_to(&DataType::Date32, schema)? + .cast_to(&DataType::Date32.into(), schema)? 
+ Expr::Literal(ScalarValue::IntervalDayTime(Some(IntervalDayTime { days: 123, milliseconds: 0, @@ -483,15 +487,15 @@ fn multiple_now() -> Result<()> { // ------------------------------ fn expr_test_schema() -> DFSchemaRef { - Schema::new(vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::Boolean, true), - Field::new("c3", DataType::Int64, true), - Field::new("c4", DataType::UInt32, true), - Field::new("c1_non_null", DataType::Utf8, false), - Field::new("c2_non_null", DataType::Boolean, false), - Field::new("c3_non_null", DataType::Int64, false), - Field::new("c4_non_null", DataType::UInt32, false), + LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("c1", DataType::Utf8, true), + LogicalPhysicalField::new("c2", DataType::Boolean, true), + LogicalPhysicalField::new("c3", DataType::Int64, true), + LogicalPhysicalField::new("c4", DataType::UInt32, true), + LogicalPhysicalField::new("c1_non_null", DataType::Utf8, false), + LogicalPhysicalField::new("c2_non_null", DataType::Boolean, false), + LogicalPhysicalField::new("c3_non_null", DataType::Int64, false), + LogicalPhysicalField::new("c4_non_null", DataType::UInt32, false), ]) .to_dfschema_ref() .unwrap() diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index f61ee5d9ab98..01eb235c7403 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -43,6 +43,7 @@ use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; use datafusion_common::{assert_contains, Result}; use datafusion::prelude::{SessionConfig, SessionContext}; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_execution::TaskContext; use test_utils::AccessLogGenerator; @@ -732,8 +733,8 @@ impl TableProvider for SortedTableProvider { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } fn table_type(&self) -> TableType { @@ -748,7 +749,7 @@ impl TableProvider for SortedTableProvider { _limit: Option, ) -> Result> { let mem_exec = - MemoryExec::try_new(&self.batches, self.schema(), projection.cloned())? + MemoryExec::try_new(&self.batches, self.schema.clone(), projection.cloned())? 
.with_sort_information(self.sort_information.clone()); Ok(Arc::new(mem_exec)) diff --git a/datafusion/core/tests/optimizer_integration.rs b/datafusion/core/tests/optimizer_integration.rs index 39f745cd3309..c955864eb05a 100644 --- a/datafusion/core/tests/optimizer_integration.rs +++ b/datafusion/core/tests/optimizer_integration.rs @@ -25,6 +25,8 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use arrow_schema::{Fields, SchemaBuilder}; use datafusion_common::config::ConfigOptions; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::tree_node::{TransformedResult, TreeNode}; use datafusion_common::{plan_err, DFSchema, Result, ScalarValue}; use datafusion_expr::interval_arithmetic::{Interval, NullableInterval}; @@ -55,8 +57,9 @@ fn init() { #[test] fn select_arrow_cast() { + // TODO(@notfilippo): scalars really need to be LogicalTypes let sql = "SELECT arrow_cast(1234, 'Float64') as f64, arrow_cast('foo', 'LargeUtf8') as large"; - let expected = "Projection: Float64(1234) AS f64, LargeUtf8(\"foo\") AS large\ + let expected = "Projection: Float64(1234) AS f64, Utf8(\"foo\") AS large\ \n EmptyRelation"; quick_test(sql, expected); } @@ -97,7 +100,7 @@ fn concat_literals() -> Result<()> { AS col FROM test"; let expected = - "Projection: concat(Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"falsehello\"), test.col_utf8, Utf8(\"123.4\")) AS col\ + "Projection: concat(Utf8(\"true\"), CAST(test.col_int32 AS TypeRelation(NativeType { logical: Utf8, physical: Utf8 })), Utf8(\"falsehello\"), test.col_utf8, Utf8(\"123.4\")) AS col\ \n TableScan: test projection=[col_int32, col_utf8]"; quick_test(sql, expected); Ok(()) @@ -109,7 +112,7 @@ fn concat_ws_literals() -> Result<()> { AS col FROM test"; let expected = - "Projection: concat_ws(Utf8(\"-\"), Utf8(\"true\"), CAST(test.col_int32 AS Utf8), Utf8(\"false-hello\"), test.col_utf8, Utf8(\"12--3.4\")) AS col\ + "Projection: concat_ws(Utf8(\"-\"), Utf8(\"true\"), CAST(test.col_int32 AS TypeRelation(NativeType { logical: Utf8, physical: Utf8 })), Utf8(\"false-hello\"), test.col_utf8, Utf8(\"12--3.4\")) AS col\ \n TableScan: test projection=[col_int32, col_utf8]"; quick_test(sql, expected); Ok(()) @@ -203,7 +206,7 @@ impl ContextProvider for MyContextProvider { None } - fn get_variable_type(&self, _variable_names: &[String]) -> Option { + fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } @@ -237,8 +240,8 @@ impl TableSource for MyTableSource { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } } @@ -259,7 +262,7 @@ fn test_nested_schema_nullability() { let dfschema = DFSchema::from_field_specific_qualified_schema( vec![Some("table_name".into()), None], - &Arc::new(schema), + &LogicalPhysicalSchemaRef::new(schema.into()), ) .unwrap(); diff --git a/datafusion/core/tests/parquet/page_pruning.rs b/datafusion/core/tests/parquet/page_pruning.rs index 15efd4bcd9dd..a2509d6c311f 100644 --- a/datafusion/core/tests/parquet/page_pruning.rs +++ b/datafusion/core/tests/parquet/page_pruning.rs @@ -32,6 +32,7 @@ use datafusion_expr::execution_props::ExecutionProps; use datafusion_expr::{col, lit, Expr}; use datafusion_physical_expr::create_physical_expr; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; use futures::StreamExt; use object_store::path::Path; use 
object_store::ObjectMeta; @@ -66,7 +67,9 @@ async fn get_parquet_exec(state: &SessionState, filter: Expr) -> ParquetExec { extensions: None, }; - let df_schema = schema.clone().to_dfschema().unwrap(); + let df_schema = LogicalPhysicalSchema::from(schema.as_ref().clone()) + .to_dfschema() + .unwrap(); let execution_props = ExecutionProps::new(); let predicate = create_physical_expr(&filter, &df_schema, &execution_props).unwrap(); diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index 2174009b8557..f609288f3539 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -15,11 +15,11 @@ // specific language governing permissions and limitations // under the License. +use super::*; use datafusion::execution::context::SessionState; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion::test_util::TestTableFactory; - -use super::*; +use datafusion_common::logical_type::TypeRelation; #[tokio::test] async fn create_custom_table() -> Result<()> { @@ -67,9 +67,18 @@ async fn create_external_table_with_ddl() -> Result<()> { assert_eq!(3, table_schema.fields().len()); - assert_eq!(&DataType::Int32, table_schema.field(0).data_type()); - assert_eq!(&DataType::Utf8, table_schema.field(1).data_type()); - assert_eq!(&DataType::Boolean, table_schema.field(2).data_type()); + assert_eq!( + &DataType::Int32, + table_schema.field(0).data_type().physical() + ); + assert_eq!( + &DataType::Utf8, + table_schema.field(1).data_type().physical() + ); + assert_eq!( + &DataType::Boolean, + table_schema.field(2).data_type().physical() + ); Ok(()) } diff --git a/datafusion/core/tests/sql/select.rs b/datafusion/core/tests/sql/select.rs index d9ef462df26c..72364e99de06 100644 --- a/datafusion/core/tests/sql/select.rs +++ b/datafusion/core/tests/sql/select.rs @@ -192,7 +192,7 @@ async fn prepared_statement_invalid_types() -> Result<()> { .with_param_values(vec![ScalarValue::from("1")]); assert_eq!( results.unwrap_err().strip_backtrace(), - "Error during planning: Expected parameter of type Int32, got Utf8 at index 0" + "Error during planning: Expected parameter of type TypeRelation(NativeType { logical: Int32, physical: Int32 }), got Utf8 at index 0" ); Ok(()) } diff --git a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs index 1733068debb9..45cbf4a8e4a8 100644 --- a/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_scalar_functions.rs @@ -25,6 +25,7 @@ use datafusion::execution::context::{FunctionFactory, RegisterFunction, SessionS use datafusion::prelude::*; use datafusion::{execution::registry::FunctionRegistry, test_util}; use datafusion_common::cast::{as_float64_array, as_int32_array}; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::tree_node::{Transformed, TreeNode}; use datafusion_common::{ assert_batches_eq, assert_batches_sorted_eq, assert_contains, exec_err, internal_err, @@ -518,14 +519,14 @@ impl ScalarUDFImpl for CastToI64UDF { // SimplifyInfo so we have to replicate some of the casting logic here. 
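// `get_data_type` now yields a `LogicalPhysicalType`; `.physical()` exposes the
// underlying arrow `DataType`, which is what the comparison below checks.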
let source_type = info.get_data_type(&arg)?; - let new_expr = if source_type == DataType::Int64 { + let new_expr = if source_type.physical() == &DataType::Int64 { // the argument's data type is already the correct type arg } else { // need to use an actual cast to get the correct type Expr::Cast(datafusion_expr::Cast { expr: Box::new(arg), - data_type: DataType::Int64, + data_type: DataType::Int64.into(), }) }; // return the newly written argument to DataFusion @@ -656,7 +657,11 @@ impl ScalarUDFImpl for TakeUDF { ); }; - arg_exprs.get(take_idx).unwrap().get_type(schema) + arg_exprs + .get(take_idx) + .unwrap() + .get_type(schema) + .map(|t| t.physical().clone()) } // The actual implementation @@ -698,8 +703,8 @@ async fn verify_udf_return_type() -> Result<()> { // The output schema should be // * type of column smallint_col (int32) // * type of column double_col (float64) - assert_eq!(schema.field(0).data_type(), &DataType::Int32); - assert_eq!(schema.field(1).data_type(), &DataType::Float64); + assert_eq!(schema.field(0).data_type().physical(), &DataType::Int32); + assert_eq!(schema.field(1).data_type().physical(), &DataType::Float64); let expected = [ "+-------+-------+", @@ -846,13 +851,17 @@ impl TryFrom for ScalarFunctionWrapper { .expect("Expression has to be defined!"), return_type: definition .return_type - .expect("Return type has to be defined!"), + .expect("Return type has to be defined!") + .physical() + .clone(), + // TODO(@notfilippo): avoid conversion to physical type signature: Signature::exact( definition .args .unwrap_or_default() .into_iter() - .map(|a| a.data_type) + // TODO(@notfilippo): avoid conversion to physical type + .map(|a| a.data_type.physical().clone()) .collect(), definition .params @@ -1001,10 +1010,10 @@ async fn create_scalar_function_from_sql_statement_postgres_syntax() -> Result<( value: "name".into(), quote_style: None, }), - data_type: DataType::Utf8, + data_type: DataType::Utf8.into(), default_expr: None, }]), - return_type: Some(DataType::Int32), + return_type: Some(DataType::Int32.into()), params: CreateFunctionBody { language: Some(Ident { value: "plrust".into(), diff --git a/datafusion/core/tests/user_defined/user_defined_table_functions.rs b/datafusion/core/tests/user_defined/user_defined_table_functions.rs index 1e8d30cab638..10e76f6cd872 100644 --- a/datafusion/core/tests/user_defined/user_defined_table_functions.rs +++ b/datafusion/core/tests/user_defined/user_defined_table_functions.rs @@ -29,6 +29,7 @@ use datafusion::execution::TaskContext; use datafusion::physical_plan::memory::MemoryExec; use datafusion::physical_plan::{collect, ExecutionPlan}; use datafusion::prelude::SessionContext; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::{assert_batches_eq, DFSchema, ScalarValue}; use datafusion_expr::{EmptyRelation, Expr, LogicalPlan, Projection, TableType}; use std::fs::File; @@ -117,8 +118,8 @@ impl TableProvider for SimpleCsvTable { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } fn table_type(&self) -> TableType { @@ -154,7 +155,7 @@ impl TableProvider for SimpleCsvTable { }; Ok(Arc::new(MemoryExec::try_new( &[batches], - TableProvider::schema(self), + self.schema.clone(), projection.cloned(), )?)) } diff --git a/datafusion/expr/src/conditional_expressions.rs b/datafusion/expr/src/conditional_expressions.rs index 7a2bf4b6c44a..02aac54b23f1 100644 --- 
a/datafusion/expr/src/conditional_expressions.rs +++ b/datafusion/expr/src/conditional_expressions.rs @@ -19,6 +19,7 @@ use crate::expr::Case; use crate::{expr_schema::ExprSchemable, Expr}; use arrow::datatypes::DataType; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{plan_err, DFSchema, Result}; use std::collections::HashSet; @@ -70,18 +71,18 @@ impl CaseBuilder { then_expr.push(e.as_ref().to_owned()); } - let then_types: Vec<DataType> = then_expr + let then_types: Vec<LogicalPhysicalType> = then_expr .iter() .map(|e| match e { Expr::Literal(_) => e.get_type(&DFSchema::empty()), - _ => Ok(DataType::Null), + _ => Ok(DataType::Null.into()), }) .collect::<Result<Vec<_>>>()?; - if then_types.contains(&DataType::Null) { + if then_types.contains(&DataType::Null.into()) { // cannot verify types until execution type } else { - let unique_types: HashSet<&DataType> = then_types.iter().collect(); + let unique_types: HashSet<&LogicalPhysicalType> = then_types.iter().collect(); if unique_types.len() != 1 { return plan_err!( "CASE expression 'then' values had multiple data types: {unique_types:?}" diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ecece6dbfce7..46878fd2c489 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -33,7 +33,9 @@ use crate::{ }; use crate::{window_frame, Volatility}; -use arrow::datatypes::{DataType, FieldRef}; +use arrow::datatypes::DataType; +use datafusion_common::logical_type::field::LogicalPhysicalFieldRef; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; @@ -153,7 +155,7 @@ use sqlparser::ast::NullTreatment; /// Field::new("c2", DataType::Float64, false), /// ]); /// // DFSchema is an Arrow schema with optional relation name -/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema) +/// let df_schema = DFSchema::try_from_qualified_schema("t1", &arrow_schema.into()) /// .unwrap(); /// /// // Form Vec<Expr> with an expression for each column in the schema @@ -223,7 +225,7 @@ pub enum Expr { /// A named reference to a qualified field in a schema. Column(Column), /// A named reference to a variable in a registry. - ScalarVariable(DataType, Vec<String>), + ScalarVariable(LogicalPhysicalType, Vec<String>), /// A constant value. Literal(ScalarValue), /// A binary expression such as "age > 21" @@ -317,7 +319,7 @@ pub enum Expr { Placeholder(Placeholder), /// A placeholder which holds a reference to a qualified field /// in the outer query, used for correlated sub queries. - OuterReferenceColumn(DataType, Column), + OuterReferenceColumn(LogicalPhysicalType, Column), /// Unnest expression Unnest(Unnest), } @@ -339,8 +341,8 @@ impl From<Column> for Expr { /// useful for creating [`Expr`] from a [`DFSchema`].
/// /// See example on [`Expr`] -impl<'a> From<(Option<&'a TableReference>, &'a FieldRef)> for Expr { - fn from(value: (Option<&'a TableReference>, &'a FieldRef)) -> Self { +impl<'a> From<(Option<&'a TableReference>, &'a LogicalPhysicalFieldRef)> for Expr { + fn from(value: (Option<&'a TableReference>, &'a LogicalPhysicalFieldRef)) -> Self { Expr::from(Column::from(value)) } } @@ -563,14 +565,17 @@ pub enum GetFieldAccess { pub struct Cast { /// The expression being cast pub expr: Box<Expr>, - /// The `DataType` the expression will yield - pub data_type: DataType, + /// The `LogicalType` the expression will yield + pub data_type: LogicalPhysicalType, } impl Cast { /// Create a new Cast expression - pub fn new(expr: Box<Expr>, data_type: DataType) -> Self { - Self { expr, data_type } + pub fn new(expr: Box<Expr>, data_type: impl Into<LogicalPhysicalType>) -> Self { + Self { + expr, + data_type: data_type.into(), + } } } @@ -579,14 +584,17 @@ impl Cast { pub struct TryCast { /// The expression being cast pub expr: Box<Expr>, - /// The `DataType` the expression will yield - pub data_type: DataType, + /// The `LogicalType` the expression will yield + pub data_type: LogicalPhysicalType, } impl TryCast { /// Create a new TryCast expression - pub fn new(expr: Box<Expr>, data_type: DataType) -> Self { - Self { expr, data_type } + pub fn new(expr: Box<Expr>, data_type: impl Into<LogicalPhysicalType>) -> Self { + Self { + expr, + data_type: data_type.into(), + } } } @@ -931,12 +939,12 @@ pub struct Placeholder { /// The identifier of the parameter, including the leading `$` (e.g., `"$1"` or `"$foo"`) pub id: String, /// The type the parameter will be filled in with - pub data_type: Option<DataType>, + pub data_type: Option<LogicalPhysicalType>, } impl Placeholder { /// Create a new Placeholder expression - pub fn new(id: String, data_type: Option<DataType>) -> Self { + pub fn new(id: String, data_type: Option<LogicalPhysicalType>) -> Self { Self { id, data_type } } } @@ -2093,7 +2101,7 @@ fn write_name<W: Write>(w: &mut W, e: &Expr) -> Result<()> { Expr::InSubquery(InSubquery { negated: true, .. }) => w.write_str("NOT IN")?, Expr::InSubquery(InSubquery { negated: false, .. }) => w.write_str("IN")?, Expr::ScalarSubquery(subquery) => { - w.write_str(subquery.subquery.schema().field(0).name().as_str())?; + w.write_str(subquery.subquery.schema().field(0).name())?; } Expr::Unnest(Unnest { expr }) => { w.write_str("unnest(")?; @@ -2255,7 +2263,7 @@ mod test { fn format_cast() -> Result<()> { let expr = Expr::Cast(Cast { expr: Box::new(Expr::Literal(ScalarValue::Float32(Some(1.23)))), - data_type: DataType::Utf8, + data_type: DataType::Utf8.into(), }); let expected_canonical = "CAST(Float32(1.23) AS Utf8)"; assert_eq!(expected_canonical, expr.canonical_name()); diff --git a/datafusion/expr/src/expr_fn.rs b/datafusion/expr/src/expr_fn.rs index 8b0213fd52fd..02167cab30c6 100644 --- a/datafusion/expr/src/expr_fn.rs +++ b/datafusion/expr/src/expr_fn.rs @@ -35,6 +35,7 @@ use arrow::compute::kernels::cast_utils::{ parse_interval_day_time, parse_interval_month_day_nano, parse_interval_year_month, }; use arrow::datatypes::{DataType, Field}; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{Column, Result, ScalarValue}; use std::any::Any; use std::fmt::Debug; @@ -62,8 +63,8 @@ pub fn col(ident: impl Into<Column>) -> Expr { /// Create an out reference column which holds a reference that has been resolved to a field /// outside of the current plan.
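Because `Cast::new` and `TryCast::new` above now take `impl Into<LogicalPhysicalType>`, call sites that pass a plain arrow `DataType` keep compiling unchanged. A short sketch of the ergonomics (assumed imports; illustrative, not part of this diff):

use arrow::datatypes::DataType;
use datafusion_expr::{col, lit, Cast, Expr};

fn cast_examples() -> (Expr, Expr) {
    // A `DataType` converts into `LogicalPhysicalType`, so both forms work.
    let to_utf8 = Expr::Cast(Cast::new(Box::new(lit(1.23_f32)), DataType::Utf8));
    let to_i64 = Expr::Cast(Cast::new(Box::new(col("a")), DataType::Int64));
    (to_utf8, to_i64)
}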
-pub fn out_ref_col(dt: DataType, ident: impl Into<Column>) -> Expr { - Expr::OuterReferenceColumn(dt, ident.into()) +pub fn out_ref_col(dt: impl Into<LogicalPhysicalType>, ident: impl Into<Column>) -> Expr { + Expr::OuterReferenceColumn(dt.into(), ident.into()) } /// Create an unqualified column expression from the provided name, without normalizing @@ -308,13 +309,13 @@ pub fn rollup(exprs: Vec<Expr>) -> Expr { } /// Create a cast expression -pub fn cast(expr: Expr, data_type: DataType) -> Expr { - Expr::Cast(Cast::new(Box::new(expr), data_type)) +pub fn cast(expr: Expr, data_type: impl Into<LogicalPhysicalType>) -> Expr { + Expr::Cast(Cast::new(Box::new(expr), data_type.into())) } /// Create a try cast expression -pub fn try_cast(expr: Expr, data_type: DataType) -> Expr { - Expr::TryCast(TryCast::new(Box::new(expr), data_type)) +pub fn try_cast(expr: Expr, data_type: impl Into<LogicalPhysicalType>) -> Expr { + Expr::TryCast(TryCast::new(Box::new(expr), data_type.into())) } /// Create is null expression diff --git a/datafusion/expr/src/expr_rewriter/mod.rs b/datafusion/expr/src/expr_rewriter/mod.rs index 91bec501f4a0..8827ac955999 100644 --- a/datafusion/expr/src/expr_rewriter/mod.rs +++ b/datafusion/expr/src/expr_rewriter/mod.rs @@ -287,7 +287,9 @@ mod test { use super::*; use crate::expr::Sort; use crate::{col, lit, Cast}; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow::datatypes::DataType; + use datafusion_common::logical_type::field::LogicalPhysicalField; + use datafusion_common::logical_type::schema::LogicalPhysicalSchema; use datafusion_common::ScalarValue; #[derive(Default)] @@ -408,10 +410,11 @@ mod test { ) -> DFSchema { let fields = fields .iter() - .map(|f| Arc::new(Field::new(f.to_string(), DataType::Int8, false))) + .map(|f| Arc::new(LogicalPhysicalField::new(f.to_string(), DataType::Int8, false))) .collect::<Vec<_>>(); - let schema = Arc::new(Schema::new(fields)); - DFSchema::from_field_specific_qualified_schema(qualifiers, &schema).unwrap() + let schema = Arc::new(LogicalPhysicalSchema::new(fields)); + DFSchema::from_field_specific_qualified_schema(qualifiers, &schema.into()) + .unwrap() } #[test] diff --git a/datafusion/expr/src/expr_rewriter/order_by.rs b/datafusion/expr/src/expr_rewriter/order_by.rs index 4b56ca3d1c2e..994053c98c3e 100644 --- a/datafusion/expr/src/expr_rewriter/order_by.rs +++ b/datafusion/expr/src/expr_rewriter/order_by.rs @@ -153,12 +153,11 @@ mod test { use std::ops::Add; use std::sync::Arc; - use arrow::datatypes::{DataType, Field, Schema}; - use crate::{ cast, col, lit, logical_plan::builder::LogicalTableSource, min, test::function_stub::avg, try_cast, LogicalPlanBuilder, }; + use arrow::datatypes::{DataType, Field, Schema}; use super::*; diff --git a/datafusion/expr/src/expr_schema.rs b/datafusion/expr/src/expr_schema.rs index 1df5d6c4d736..7be6ce38e20f 100644 --- a/datafusion/expr/src/expr_schema.rs +++ b/datafusion/expr/src/expr_schema.rs @@ -26,7 +26,10 @@ use crate::type_coercion::functions::{ }; use crate::{utils, LogicalPlan, Projection, Subquery, WindowFunctionDefinition}; use arrow::compute::can_cast_types; -use arrow::datatypes::{DataType, Field}; +use arrow::datatypes::DataType; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::{ internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, ExprSchema, Result, TableReference, }; @@ -37,7 +40,7 @@ use std::sync::Arc; /// trait to allow expr to be typable with respect to a schema pub trait
ExprSchemable { /// given a schema, return the type of the expr - fn get_type(&self, schema: &dyn ExprSchema) -> Result; + fn get_type(&self, schema: &dyn ExprSchema) -> Result; /// given a schema, return the nullability of the expr fn nullable(&self, input_schema: &dyn ExprSchema) -> Result; @@ -49,14 +52,20 @@ pub trait ExprSchemable { fn to_field( &self, input_schema: &dyn ExprSchema, - ) -> Result<(Option, Arc)>; + ) -> Result<(Option, Arc)>; /// cast to a type with respect to a schema - fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result; + fn cast_to( + self, + cast_to_type: &LogicalPhysicalType, + schema: &dyn ExprSchema, + ) -> Result; /// given a schema, return the type and nullability of the expr - fn data_type_and_nullable(&self, schema: &dyn ExprSchema) - -> Result<(DataType, bool)>; + fn data_type_and_nullable( + &self, + schema: &dyn ExprSchema, + ) -> Result<(LogicalPhysicalType, bool)>; } impl ExprSchemable for Expr { @@ -98,7 +107,7 @@ impl ExprSchemable for Expr { /// expression refers to a column that does not exist in the /// schema, or when the expression is incorrectly typed /// (e.g. `[utf8] + [bool]`). - fn get_type(&self, schema: &dyn ExprSchema) -> Result { + fn get_type(&self, schema: &dyn ExprSchema) -> Result { match self { Expr::Alias(Alias { expr, name, .. }) => match &**expr { Expr::Placeholder(Placeholder { data_type, .. }) => match &data_type { @@ -111,19 +120,17 @@ impl ExprSchemable for Expr { Expr::Column(c) => Ok(schema.data_type(c)?.clone()), Expr::OuterReferenceColumn(ty, _) => Ok(ty.clone()), Expr::ScalarVariable(ty, _) => Ok(ty.clone()), - Expr::Literal(l) => Ok(l.data_type()), + Expr::Literal(l) => Ok(l.data_type().into()), Expr::Case(case) => case.when_then_expr[0].1.get_type(schema), Expr::Cast(Cast { data_type, .. }) | Expr::TryCast(TryCast { data_type, .. }) => Ok(data_type.clone()), Expr::Unnest(Unnest { expr }) => { let arg_data_type = expr.get_type(schema)?; // Unnest's output type is the inner type of the list - match arg_data_type { - DataType::List(field) - | DataType::LargeList(field) - | DataType::FixedSizeList(field, _) => Ok(field.data_type().clone()), - DataType::Struct(_) => Ok(arg_data_type), - DataType::Null => { + match arg_data_type.logical() { + LogicalType::List(field) => Ok(field.data_type().clone()), + LogicalType::Struct(_) => Ok(arg_data_type), + LogicalType::Null => { not_impl_err!("unnest() does not support null yet") } _ => { @@ -138,6 +145,13 @@ impl ExprSchemable for Expr { .iter() .map(|e| e.get_type(schema)) .collect::>>()?; + + // TODO(@notfilippo): not convert to DataType + let arg_data_types = arg_data_types + .into_iter() + .map(|e| e.physical().clone()) + .collect::>(); + // verify that function is invoked with correct number and type of arguments as defined in `TypeSignature` data_types_with_scalar_udf(&arg_data_types, func).map_err(|err| { plan_datafusion_err!( @@ -153,13 +167,20 @@ impl ExprSchemable for Expr { // perform additional function arguments validation (due to limited // expressiveness of `TypeSignature`), then infer return type - Ok(func.return_type_from_exprs(args, schema, &arg_data_types)?) + Ok(func + .return_type_from_exprs(args, schema, &arg_data_types)? + .into()) } Expr::WindowFunction(WindowFunction { fun, args, .. 
}) => { let data_types = args .iter() .map(|e| e.get_type(schema)) .collect::>>()?; + // TODO(@notfilippo): not convert to DataType + let data_types = data_types + .into_iter() + .map(|e| e.physical().clone()) + .collect::>(); let nullability = args .iter() .map(|e| e.nullable(schema)) @@ -178,9 +199,9 @@ impl ExprSchemable for Expr { ) ) })?; - Ok(fun.return_type(&new_types, &nullability)?) + Ok(fun.return_type(&new_types, &nullability)?.into()) } - _ => fun.return_type(&data_types, &nullability), + _ => Ok(fun.return_type(&data_types, &nullability)?.into()), } } Expr::AggregateFunction(AggregateFunction { func_def, args, .. }) => { @@ -192,9 +213,14 @@ impl ExprSchemable for Expr { .iter() .map(|e| e.nullable(schema)) .collect::>>()?; + // TODO(@notfilippo): not convert to DataType + let data_types = data_types + .into_iter() + .map(|e| e.physical().clone()) + .collect::>(); match func_def { AggregateFunctionDefinition::BuiltIn(fun) => { - fun.return_type(&data_types, &nullability) + Ok(fun.return_type(&data_types, &nullability)?.into()) } AggregateFunctionDefinition::UDF(fun) => { let new_types = data_types_with_aggregate_udf(&data_types, fun) @@ -209,7 +235,7 @@ impl ExprSchemable for Expr { ) ) })?; - Ok(fun.return_type(&new_types)?) + Ok(fun.return_type(&new_types)?.into()) } } } @@ -225,7 +251,7 @@ impl ExprSchemable for Expr { | Expr::IsUnknown(_) | Expr::IsNotTrue(_) | Expr::IsNotFalse(_) - | Expr::IsNotUnknown(_) => Ok(DataType::Boolean), + | Expr::IsNotUnknown(_) => Ok(DataType::Boolean.into()), Expr::ScalarSubquery(subquery) => { Ok(subquery.subquery.schema().field(0).data_type().clone()) } @@ -233,8 +259,14 @@ impl ExprSchemable for Expr { ref left, ref right, ref op, - }) => get_result_type(&left.get_type(schema)?, op, &right.get_type(schema)?), - Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean), + // TODO(@notfilippo): do not convert to physical type + }) => Ok(get_result_type( + &left.get_type(schema)?.physical(), + op, + &right.get_type(schema)?.physical(), + )? + .into()), + Expr::Like { .. } | Expr::SimilarTo { .. } => Ok(DataType::Boolean.into()), Expr::Placeholder(Placeholder { data_type, .. }) => { data_type.clone().ok_or_else(|| { plan_datafusion_err!( @@ -248,12 +280,12 @@ impl ExprSchemable for Expr { // Wildcard do not really have a type and do not appear in projections match qualifier { Some(_) => internal_err!("QualifiedWildcard expressions are not valid in a logical query plan"), - None => Ok(DataType::Null) + None => Ok(DataType::Null.into()) } } Expr::GroupingSet(_) => { // grouping sets do not really have a type and do not appear in projections - Ok(DataType::Null) + Ok(DataType::Null.into()) } } } @@ -399,7 +431,7 @@ impl ExprSchemable for Expr { fn data_type_and_nullable( &self, schema: &dyn ExprSchema, - ) -> Result<(DataType, bool)> { + ) -> Result<(LogicalPhysicalType, bool)> { match self { Expr::Alias(Alias { expr, name, .. }) => match &**expr { Expr::Placeholder(Placeholder { data_type, .. }) => match &data_type { @@ -418,7 +450,7 @@ impl ExprSchemable for Expr { .map(|(d, n)| (d.clone(), n)), Expr::OuterReferenceColumn(ty, _) => Ok((ty.clone(), true)), Expr::ScalarVariable(ty, _) => Ok((ty.clone(), true)), - Expr::Literal(l) => Ok((l.data_type(), l.is_null())), + Expr::Literal(l) => Ok((l.data_type().into(), l.is_null())), Expr::IsNull(_) | Expr::IsNotNull(_) | Expr::IsTrue(_) @@ -427,7 +459,7 @@ impl ExprSchemable for Expr { | Expr::IsNotTrue(_) | Expr::IsNotFalse(_) | Expr::IsNotUnknown(_) - | Expr::Exists { .. 
} => Ok((DataType::Boolean, false)), + | Expr::Exists { .. } => Ok((DataType::Boolean.into(), false)), Expr::ScalarSubquery(subquery) => Ok(( subquery.subquery.schema().field(0).data_type().clone(), subquery.subquery.schema().field(0).is_nullable(), @@ -439,7 +471,11 @@ impl ExprSchemable for Expr { }) => { let left = left.data_type_and_nullable(schema)?; let right = right.data_type_and_nullable(schema)?; - Ok((get_result_type(&left.0, op, &right.0)?, left.1 || right.1)) + // TODO(@notfilippo): do not convert to physical type + Ok(( + get_result_type(&left.0.physical(), op, &right.0.physical())?.into(), + left.1 || right.1, + )) } _ => Ok((self.get_type(schema)?, self.nullable(schema)?)), } @@ -452,13 +488,13 @@ impl ExprSchemable for Expr { fn to_field( &self, input_schema: &dyn ExprSchema, - ) -> Result<(Option, Arc)> { + ) -> Result<(Option, Arc)> { match self { Expr::Column(c) => { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( c.relation.clone(), - Field::new(&c.name, data_type, nullable) + LogicalPhysicalField::new(&c.name, data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) @@ -467,7 +503,7 @@ impl ExprSchemable for Expr { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( relation.clone(), - Field::new(name, data_type, nullable) + LogicalPhysicalField::new(name, data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) @@ -476,7 +512,7 @@ impl ExprSchemable for Expr { let (data_type, nullable) = self.data_type_and_nullable(input_schema)?; Ok(( None, - Field::new(self.display_name()?, data_type, nullable) + LogicalPhysicalField::new(self.display_name()?, data_type, nullable) .with_metadata(self.metadata(input_schema)?) .into(), )) @@ -490,7 +526,11 @@ impl ExprSchemable for Expr { /// /// This function errors when it is impossible to cast the /// expression to the target [arrow::datatypes::DataType]. - fn cast_to(self, cast_to_type: &DataType, schema: &dyn ExprSchema) -> Result { + fn cast_to( + self, + cast_to_type: &LogicalPhysicalType, + schema: &dyn ExprSchema, + ) -> Result { let this_type = self.get_type(schema)?; if this_type == *cast_to_type { return Ok(self); @@ -500,7 +540,8 @@ impl ExprSchemable for Expr { // like all of the binary expressions below. Perhaps Expr should track the // type of the expression? - if can_cast_types(&this_type, cast_to_type) { + // TODO(@notfilippo): The basis for whether cast can be executed should be the logical type + if can_cast_types(&this_type.physical(), &cast_to_type.physical()) { match self { Expr::ScalarSubquery(subquery) => { Ok(Expr::ScalarSubquery(cast_subquery(subquery, cast_to_type)?)) @@ -514,7 +555,10 @@ impl ExprSchemable for Expr { } /// cast subquery in InSubquery/ScalarSubquery to a given type. 
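The tests above compare types through the `TypeRelation` trait instead of against raw arrow types. The same pattern applies outside tests; a sketch (assumed imports, mirroring `SimplifyContext::is_boolean_type` from this diff):

use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::TypeRelation;
use datafusion_common::{DFSchema, Result};
use datafusion_expr::{col, ExprSchemable};

fn flag_is_boolean(schema: &DFSchema) -> Result<bool> {
    // `get_type` returns a LogicalPhysicalType; `.logical()` strips the
    // physical encoding so the check is independent of the arrow layout.
    Ok(matches!(col("flag").get_type(schema)?.logical(), LogicalType::Boolean))
}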
-pub fn cast_subquery(subquery: Subquery, cast_to_type: &DataType) -> Result { +pub fn cast_subquery( + subquery: Subquery, + cast_to_type: &LogicalPhysicalType, +) -> Result { if subquery.subquery.schema().field(0).data_type() == cast_to_type { return Ok(subquery); } @@ -581,7 +625,7 @@ mod tests { fn test_between_nullability() { let get_schema = |nullable| { MockExprSchema::new() - .with_data_type(DataType::Int32) + .with_data_type(DataType::Int32.into()) .with_nullable(nullable) }; @@ -605,7 +649,7 @@ mod tests { fn test_inlist_nullability() { let get_schema = |nullable| { MockExprSchema::new() - .with_data_type(DataType::Int32) + .with_data_type(DataType::Int32.into()) .with_nullable(nullable) }; @@ -630,7 +674,7 @@ mod tests { fn test_like_nullability() { let get_schema = |nullable| { MockExprSchema::new() - .with_data_type(DataType::Utf8) + .with_data_type(DataType::Utf8.into()) .with_nullable(nullable) }; @@ -646,9 +690,10 @@ mod tests { fn expr_schema_data_type() { let expr = col("foo"); assert_eq!( - DataType::Utf8, - expr.get_type(&MockExprSchema::new().with_data_type(DataType::Utf8)) + &LogicalType::Utf8, + expr.get_type(&MockExprSchema::new().with_data_type(DataType::Utf8.into())) .unwrap() + .logical() ); } @@ -658,7 +703,7 @@ mod tests { meta.insert("bar".to_string(), "buzz".to_string()); let expr = col("foo"); let schema = MockExprSchema::new() - .with_data_type(DataType::Int32) + .with_data_type(DataType::Int32.into()) .with_metadata(meta.clone()); // col and alias should be metadata-preserving @@ -669,15 +714,16 @@ mod tests { assert_eq!( HashMap::new(), expr.clone() - .cast_to(&DataType::Int64, &schema) + .cast_to(&DataType::Int64.into(), &schema) .unwrap() .metadata(&schema) .unwrap() ); let schema = DFSchema::from_unqualified_fields( - vec![Field::new("foo", DataType::Int32, true).with_metadata(meta.clone())] - .into(), + vec![LogicalPhysicalField::new("foo", DataType::Int32, true) + .with_metadata(meta.clone())] + .into(), HashMap::new(), ) .unwrap(); @@ -689,7 +735,7 @@ mod tests { #[derive(Debug)] struct MockExprSchema { nullable: bool, - data_type: DataType, + data_type: LogicalPhysicalType, error_on_nullable: bool, metadata: HashMap, } @@ -698,7 +744,7 @@ mod tests { fn new() -> Self { Self { nullable: false, - data_type: DataType::Null, + data_type: DataType::Null.into(), error_on_nullable: false, metadata: HashMap::new(), } @@ -709,7 +755,7 @@ mod tests { self } - fn with_data_type(mut self, data_type: DataType) -> Self { + fn with_data_type(mut self, data_type: LogicalPhysicalType) -> Self { self.data_type = data_type; self } @@ -734,7 +780,7 @@ mod tests { } } - fn data_type(&self, _col: &Column) -> Result<&DataType> { + fn data_type(&self, _col: &Column) -> Result<&LogicalPhysicalType> { Ok(&self.data_type) } @@ -742,7 +788,7 @@ mod tests { Ok(&self.metadata) } - fn data_type_and_nullable(&self, col: &Column) -> Result<(&DataType, bool)> { + fn data_type_and_nullable(&self, col: &Column) -> Result<(&LogicalPhysicalType, bool)> { Ok((self.data_type(col)?, self.nullable(col)?)) } } diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 4ad3bd5018a4..f28efd43affc 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -48,9 +48,14 @@ use crate::{ WriteOp, }; -use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Schema, SchemaRef}; use datafusion_common::display::ToStringifiedPlan; use 
datafusion_common::file_options::file_type::FileType; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::fields::LogicalPhysicalFields; +use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef}; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::{ get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, @@ -182,32 +187,34 @@ impl LogicalPlanBuilder { } let empty_schema = DFSchema::empty(); - let mut field_types: Vec = Vec::with_capacity(n_cols); + let mut field_types: Vec = Vec::with_capacity(n_cols); for j in 0..n_cols { - let mut common_type: Option = None; + let mut common_type: Option = None; for (i, row) in values.iter().enumerate() { let value = &row[j]; let data_type = value.get_type(&empty_schema)?; - if data_type == DataType::Null { + if *data_type.logical() == LogicalType::Null { continue; } if let Some(prev_type) = common_type { // get common type of each column values. - let Some(new_type) = values_coercion(&data_type, &prev_type) else { + let Some(new_type) = + values_coercion(&data_type.physical(), &prev_type.physical()) + else { return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}"); }; - common_type = Some(new_type); + common_type = Some(new_type.into()); } else { common_type = Some(data_type.clone()); } } - field_types.push(common_type.unwrap_or(DataType::Utf8)); + field_types.push(common_type.unwrap_or(DataType::Utf8.into())); } // wrap cast if data type is not same as common type. 
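// Null literals are re-materialized from the field's physical type (a
// ScalarValue is still a physical value), while non-null values are cast
// to the common logical field type via `Expr::cast_to`.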
for row in &mut values { for (j, field_type) in field_types.iter().enumerate() { if let Expr::Literal(ScalarValue::Null) = row[j] { - row[j] = Expr::Literal(ScalarValue::try_from(field_type.clone())?); + row[j] = Expr::Literal(ScalarValue::try_from(field_type.physical())?); } else { row[j] = std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?; @@ -220,7 +227,7 @@ impl LogicalPlanBuilder { .map(|(j, data_type)| { // naming is following convention https://www.postgresql.org/docs/current/queries-values.html let name = &format!("column{}", j + 1); - Field::new(name, data_type.clone(), true) + LogicalPhysicalField::new(name, data_type.clone(), true) }) .collect::>(); let dfschema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new())?; @@ -289,7 +296,7 @@ impl LogicalPlanBuilder { pub fn insert_into( input: LogicalPlan, table_name: impl Into, - table_schema: &Schema, + table_schema: &LogicalPhysicalSchema, overwrite: bool, ) -> Result { let table_schema = table_schema.clone().to_dfschema_ref()?; @@ -383,7 +390,7 @@ impl LogicalPlanBuilder { } /// Make a builder for a prepare logical plan from the builder's plan - pub fn prepare(self, name: String, data_types: Vec) -> Result { + pub fn prepare(self, name: String, data_types: Vec) -> Result { Ok(Self::from(LogicalPlan::Prepare(Prepare { name, data_types, @@ -1181,7 +1188,7 @@ impl From> for LogicalPlanBuilder { } } -pub fn change_redundant_column(fields: &Fields) -> Vec { +pub fn change_redundant_column(fields: &LogicalPhysicalFields) -> Vec { let mut name_map = HashMap::new(); fields .into_iter() @@ -1190,7 +1197,11 @@ pub fn change_redundant_column(fields: &Fields) -> Vec { *counter += 1; if *counter > 1 { let new_name = format!("{}:{}", field.name(), *counter - 1); - Field::new(new_name, field.data_type().clone(), field.is_nullable()) + LogicalPhysicalField::new( + new_name, + field.data_type().clone(), + field.is_nullable(), + ) } else { field.as_ref().clone() } @@ -1205,8 +1216,8 @@ pub fn build_join_schema( join_type: &JoinType, ) -> Result { fn nullify_fields<'a>( - fields: impl Iterator, &'a Arc)>, - ) -> Vec<(Option, Arc)> { + fields: impl Iterator, &'a Arc)>, + ) -> Vec<(Option, Arc)> { fields .map(|(q, f)| { // TODO: find a good way to do that @@ -1219,57 +1230,58 @@ pub fn build_join_schema( let right_fields = right.iter(); let left_fields = left.iter(); - let qualified_fields: Vec<(Option, Arc)> = match join_type { - JoinType::Inner => { - // left then right - let left_fields = left_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect::>(); - let right_fields = right_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect::>(); - left_fields.into_iter().chain(right_fields).collect() - } - JoinType::Left => { - // left then right, right set to nullable in case of not matched scenario - let left_fields = left_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect::>(); - left_fields - .into_iter() - .chain(nullify_fields(right_fields)) - .collect() - } - JoinType::Right => { - // left then right, left set to nullable in case of not matched scenario - let right_fields = right_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect::>(); - nullify_fields(left_fields) - .into_iter() - .chain(right_fields) - .collect() - } - JoinType::Full => { - // left then right, all set to nullable in case of not matched scenario - nullify_fields(left_fields) - .into_iter() - .chain(nullify_fields(right_fields)) - .collect() - } - JoinType::LeftSemi | JoinType::LeftAnti => { - // Only use the left 
side for the schema - left_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect() - } - JoinType::RightSemi | JoinType::RightAnti => { - // Only use the right side for the schema - right_fields - .map(|(q, f)| (q.cloned(), Arc::clone(f))) - .collect() - } - }; + let qualified_fields: Vec<(Option, Arc)> = + match join_type { + JoinType::Inner => { + // left then right + let left_fields = left_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect::>(); + let right_fields = right_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect::>(); + left_fields.into_iter().chain(right_fields).collect() + } + JoinType::Left => { + // left then right, right set to nullable in case of not matched scenario + let left_fields = left_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect::>(); + left_fields + .into_iter() + .chain(nullify_fields(right_fields)) + .collect() + } + JoinType::Right => { + // left then right, left set to nullable in case of not matched scenario + let right_fields = right_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect::>(); + nullify_fields(left_fields) + .into_iter() + .chain(right_fields) + .collect() + } + JoinType::Full => { + // left then right, all set to nullable in case of not matched scenario + nullify_fields(left_fields) + .into_iter() + .chain(nullify_fields(right_fields)) + .collect() + } + JoinType::LeftSemi | JoinType::LeftAnti => { + // Only use the left side for the schema + left_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect() + } + JoinType::RightSemi | JoinType::RightAnti => { + // Only use the right side for the schema + right_fields + .map(|(q, f)| (q.cloned(), Arc::clone(f))) + .collect() + } + }; let func_dependencies = left.functional_dependencies().join( right.functional_dependencies(), join_type, @@ -1381,9 +1393,10 @@ pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result Result SchemaRef { - Arc::clone(&self.table_schema) + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.table_schema.clone().into()) } fn supports_filters_pushdown( @@ -1605,15 +1622,13 @@ pub fn unnest(input: LogicalPlan, columns: Vec) -> Result { // - Struct(field1, field2) returns ["a.field1","a.field2"] pub fn get_unnested_columns( col_name: &String, - data_type: &DataType, -) -> Result)>> { + data_type: &LogicalPhysicalType, +) -> Result)>> { let mut qualified_columns = Vec::with_capacity(1); - match data_type { - DataType::List(field) - | DataType::FixedSizeList(field, _) - | DataType::LargeList(field) => { - let new_field = Arc::new(Field::new( + match data_type.logical() { + LogicalType::List(field) => { + let new_field = Arc::new(LogicalPhysicalField::new( col_name.clone(), field.data_type().clone(), // Unnesting may produce NULLs even if the list is not null. 
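Note how the three arrow list variants collapse into a single `LogicalType::List` arm here. A sketch of the resulting classification (illustrative helper, not part of this diff):

use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType};

fn is_unnestable(dt: &LogicalPhysicalType) -> bool {
    // List, LargeList and FixedSizeList all map to LogicalType::List,
    // so one match arm now covers what used to take three.
    matches!(dt.logical(), LogicalType::List(_) | LogicalType::Struct(_))
}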
@@ -1624,7 +1639,7 @@ pub fn get_unnested_columns( // let column = Column::from((None, &new_field)); qualified_columns.push((column, new_field)); } - DataType::Struct(fields) => { + LogicalType::Struct(fields) => { qualified_columns.extend(fields.iter().map(|f| { let new_name = format!("{}.{}", col_name, f.name()); let column = Column::from_name(&new_name); @@ -1672,11 +1687,9 @@ pub fn unnest_with_options( &column_to_unnest.name, original_field.data_type(), )?; - match original_field.data_type() { - DataType::List(_) - | DataType::FixedSizeList(_, _) - | DataType::LargeList(_) => list_columns.push(index), - DataType::Struct(_) => struct_columns.push(index), + match original_field.data_type().logical() { + LogicalType::List(_) => list_columns.push(index), + LogicalType::Struct(_) => struct_columns.push(index), _ => { panic!( "not reachable, should be caught by get_unnested_columns" @@ -1688,7 +1701,7 @@ pub fn unnest_with_options( .extend(std::iter::repeat(index).take(flatten_columns.len())); Ok(flatten_columns .iter() - .map(|col: &(Column, Arc)| { + .map(|col: &(Column, Arc)| { (col.0.relation.to_owned(), col.1.to_owned()) }) .collect()) @@ -1729,6 +1742,8 @@ mod tests { use super::*; use crate::logical_plan::StringifiedPlan; use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery}; + use arrow::datatypes::{DataType, Field, Fields}; + use datafusion_common::logical_type::TypeRelation; use datafusion_common::SchemaError; @@ -1758,7 +1773,7 @@ mod tests { .unwrap(); let expected = DFSchema::try_from_qualified_schema( TableReference::bare("employee_csv"), - &schema, + &schema.clone().into(), ) .unwrap(); assert_eq!(&expected, plan.schema().as_ref()); @@ -2115,7 +2130,7 @@ mod tests { // Check unnested field is a scalar let field = plan.schema().field_with_name(None, "strings").unwrap(); - assert_eq!(&DataType::Utf8, field.data_type()); + assert_eq!(&LogicalType::Utf8, field.data_type().logical()); // Unnesting the singular struct column result into 2 new columns for each subfield let plan = nested_table_scan("test_table")? @@ -2133,7 +2148,7 @@ mod tests { .schema() .field_with_name(None, &format!("struct_singular.{}", field_name)) .unwrap(); - assert_eq!(&DataType::UInt32, field.data_type()); + assert_eq!(&LogicalType::UInt32, field.data_type().logical()); } // Unnesting multiple fields in separate plans @@ -2152,7 +2167,10 @@ mod tests { // Check unnested struct list field should be a struct. 
let field = plan.schema().field_with_name(None, "structs").unwrap(); - assert!(matches!(field.data_type(), DataType::Struct(_))); + assert!(matches!( + field.data_type().logical(), + LogicalType::Struct(_) + )); // Unnesting multiple fields at the same time let cols = vec!["strings", "structs", "struct_singular"] @@ -2226,23 +2244,23 @@ mod tests { #[test] fn test_change_redundant_column() -> Result<()> { - let t1_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_1 = Field::new("a", DataType::Int32, false); - let t2_field_3 = Field::new("a", DataType::Int32, false); - let t1_field_2 = Field::new("b", DataType::Int32, false); - let t2_field_2 = Field::new("b", DataType::Int32, false); + let t1_field_1 = LogicalPhysicalField::new("a", DataType::Int32, false); + let t2_field_1 = LogicalPhysicalField::new("a", DataType::Int32, false); + let t2_field_3 = LogicalPhysicalField::new("a", DataType::Int32, false); + let t1_field_2 = LogicalPhysicalField::new("b", DataType::Int32, false); + let t2_field_2 = LogicalPhysicalField::new("b", DataType::Int32, false); let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3]; - let remove_redundant = change_redundant_column(&Fields::from(field_vec)); + let remove_redundant = change_redundant_column(&LogicalPhysicalFields::from(field_vec)); assert_eq!( remove_redundant, vec![ - Field::new("a", DataType::Int32, false), - Field::new("a:1", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - Field::new("b:1", DataType::Int32, false), - Field::new("a:2", DataType::Int32, false), + LogicalPhysicalField::new("a", DataType::Int32, false), + LogicalPhysicalField::new("a:1", DataType::Int32, false), + LogicalPhysicalField::new("b", DataType::Int32, false), + LogicalPhysicalField::new("b:1", DataType::Int32, false), + LogicalPhysicalField::new("a:2", DataType::Int32, false), ] ); Ok(()) diff --git a/datafusion/expr/src/logical_plan/ddl.rs b/datafusion/expr/src/logical_plan/ddl.rs index 45ddbafecfd7..a5627b7c1f27 100644 --- a/datafusion/expr/src/logical_plan/ddl.rs +++ b/datafusion/expr/src/logical_plan/ddl.rs @@ -24,7 +24,7 @@ use std::{ use crate::{Expr, LogicalPlan, Volatility}; -use arrow::datatypes::DataType; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{Constraints, DFSchemaRef, SchemaReference, TableReference}; use sqlparser::ast::Ident; @@ -322,7 +322,7 @@ pub struct CreateFunction { pub temporary: bool, pub name: String, pub args: Option>, - pub return_type: Option, + pub return_type: Option, pub params: CreateFunctionBody, /// Dummy schema pub schema: DFSchemaRef, @@ -332,7 +332,7 @@ pub struct OperateFunctionArg { // TODO: figure out how to support mode // pub mode: Option, pub name: Option, - pub data_type: DataType, + pub data_type: LogicalPhysicalType, pub default_expr: Option, } #[derive(Clone, PartialEq, Eq, Hash, Debug)] diff --git a/datafusion/expr/src/logical_plan/dml.rs b/datafusion/expr/src/logical_plan/dml.rs index c9eef9bd34cc..87d41c2846ef 100644 --- a/datafusion/expr/src/logical_plan/dml.rs +++ b/datafusion/expr/src/logical_plan/dml.rs @@ -20,12 +20,13 @@ use std::fmt::{self, Display}; use std::hash::{Hash, Hasher}; use std::sync::Arc; -use arrow::datatypes::{DataType, Field, Schema}; +use crate::LogicalPlan; +use arrow::datatypes::DataType; use datafusion_common::file_options::file_type::FileType; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; use 
datafusion_common::{DFSchemaRef, TableReference}; -use crate::LogicalPlan; - /// Operator that copies the contents of a database to file(s) #[derive(Clone)] pub struct CopyTo { @@ -130,7 +131,7 @@ impl Display for WriteOp { fn make_count_schema() -> DFSchemaRef { Arc::new( - Schema::new(vec![Field::new("count", DataType::UInt64, false)]) + LogicalPhysicalSchema::new(vec![LogicalPhysicalField::new("count", DataType::UInt64, false)]) .try_into() .unwrap(), ) diff --git a/datafusion/expr/src/logical_plan/extension.rs b/datafusion/expr/src/logical_plan/extension.rs index 2f581c1928f4..05f0b59c3fda 100644 --- a/datafusion/expr/src/logical_plan/extension.rs +++ b/datafusion/expr/src/logical_plan/extension.rs @@ -349,5 +349,9 @@ impl UserDefinedLogicalNode for T { } fn get_all_columns_from_schema(schema: &DFSchema) -> HashSet { - schema.fields().iter().map(|f| f.name().clone()).collect() + schema + .fields() + .iter() + .map(|f| f.name().to_string()) + .collect() } diff --git a/datafusion/expr/src/logical_plan/plan.rs b/datafusion/expr/src/logical_plan/plan.rs index bda03fb7087a..bd101d41eaa6 100644 --- a/datafusion/expr/src/logical_plan/plan.rs +++ b/datafusion/expr/src/logical_plan/plan.rs @@ -40,7 +40,8 @@ use crate::{ TableProviderFilterPushDown, TableSource, WindowFunctionDefinition, }; -use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; +use arrow::datatypes::{DataType, Field, Schema}; +use datafusion_common::logical_type::signature::LogicalType; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; @@ -54,6 +55,9 @@ use datafusion_common::{ use crate::display::PgJsonVisitor; use crate::logical_plan::tree_node::unwrap_arc; pub use datafusion_common::display::{PlanType, StringifiedPlan, ToStringifiedPlan}; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef}; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; pub use datafusion_common::{JoinConstraint, JoinType}; /// A `LogicalPlan` is a node in a tree of relational operators (such as @@ -351,20 +355,24 @@ impl LogicalPlan { } /// Returns the (fixed) output schema for explain plans - pub fn explain_schema() -> SchemaRef { - SchemaRef::new(Schema::new(vec![ - Field::new("plan_type", DataType::Utf8, false), - Field::new("plan", DataType::Utf8, false), - ])) + pub fn explain_schema() -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new( + Schema::new(vec![ + Field::new("plan_type", DataType::Utf8, false), + Field::new("plan", DataType::Utf8, false), + ]) + .into(), + ) } /// Returns the (fixed) output schema for `DESCRIBE` plans - pub fn describe_schema() -> Schema { + pub fn describe_schema() -> LogicalPhysicalSchema { Schema::new(vec![ Field::new("column_name", DataType::Utf8, false), Field::new("data_type", DataType::Utf8, false), Field::new("is_nullable", DataType::Utf8, false), ]) + .into() } /// Returns all expressions (non-recursively) evaluated by the current @@ -1387,8 +1395,8 @@ impl LogicalPlan { /// Walk the logical plan, find any `Placeholder` tokens, and return a map of their IDs and DataTypes pub fn get_parameter_types( &self, - ) -> Result>, DataFusionError> { - let mut param_types: HashMap> = HashMap::new(); + ) -> Result>, DataFusionError> { + let mut param_types: HashMap> = HashMap::new(); self.apply_with_subqueries(|plan| { plan.apply_expressions(|expr| { @@ -1650,7 +1658,7 @@ impl LogicalPlan { let schema = source.schema(); let 
names: Vec<&str> = indices .iter() - .map(|i| schema.field(*i).name().as_str()) + .map(|i| schema.field(*i).name()) .collect(); format!(" projection=[{}]", names.join(", ")) } @@ -1906,7 +1914,7 @@ impl LogicalPlan { .map(|i| &input_columns[*i]) .collect::>(); // get items from input_columns indexed by list_col_indices - write!(f, "Unnest: lists[{}] structs[{}]", + write!(f, "Unnest: lists[{}] structs[{}]", expr_vec_fmt!(list_type_columns), expr_vec_fmt!(struct_type_columns)) } @@ -2084,7 +2092,7 @@ impl SubqueryAlias { // functional dependencies: let func_dependencies = plan.schema().functional_dependencies().clone(); let schema = DFSchemaRef::new( - DFSchema::try_from_qualified_schema(alias.clone(), &schema)? + DFSchema::try_from_qualified_schema(alias.clone(), &schema.into())? .with_functional_dependencies(func_dependencies)?, ); Ok(SubqueryAlias { @@ -2123,7 +2131,7 @@ impl Filter { // construction (such as with correlated subqueries) so we make a best effort here and // ignore errors resolving the expression against the schema. if let Ok(predicate_type) = predicate.get_type(input.schema()) { - if predicate_type != DataType::Boolean { + if *predicate_type.logical() != LogicalType::Boolean { return plan_err!( "Cannot create filter with non-boolean predicate '{predicate}' returning {predicate_type}" ); @@ -2218,7 +2226,7 @@ pub struct Window { impl Window { /// Create a new window operator. pub fn try_new(window_expr: Vec, input: Arc) -> Result { - let fields: Vec<(Option, Arc)> = input + let fields: Vec<(Option, Arc)> = input .schema() .iter() .map(|(q, f)| (q.cloned(), Arc::clone(f))) @@ -2372,9 +2380,7 @@ impl TableScan { let df_schema = DFSchema::new_with_metadata( p.iter() - .map(|i| { - (Some(table_name.clone()), Arc::new(schema.field(*i).clone())) - }) + .map(|i| (Some(table_name.clone()), schema.field(*i).clone())) .collect(), schema.metadata.clone(), )?; @@ -2434,7 +2440,7 @@ pub struct Prepare { /// The name of the statement pub name: String, /// Data types of the parameters ([`Expr::Placeholder`]) - pub data_types: Vec, + pub data_types: Vec, /// The logical plan of the statements pub input: Arc, } @@ -3455,7 +3461,7 @@ digraph { let schema = Arc::new( DFSchema::try_from_qualified_schema( TableReference::bare("tab"), - &source.schema(), + &source.schema().as_ref().clone().into(), ) .unwrap(), ); diff --git a/datafusion/expr/src/planner.rs b/datafusion/expr/src/planner.rs index aeb8ed8372b7..a115a8984f86 100644 --- a/datafusion/expr/src/planner.rs +++ b/datafusion/expr/src/planner.rs @@ -19,14 +19,14 @@ use std::sync::Arc; -use arrow::datatypes::{DataType, SchemaRef}; +use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF}; +use arrow::datatypes::SchemaRef; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{ config::ConfigOptions, file_options::file_type::FileType, not_impl_err, DFSchema, Result, TableReference, }; -use crate::{AggregateUDF, Expr, GetFieldAccess, ScalarUDF, TableSource, WindowUDF}; - /// Provides the `SQL` query planner meta-data about tables and /// functions referenced in SQL statements, without a direct dependency on other /// DataFusion structures @@ -67,7 +67,7 @@ pub trait ContextProvider { /// Getter for a UDWF fn get_window_meta(&self, name: &str) -> Option>; /// Getter for system/user-defined variable type - fn get_variable_type(&self, variable_names: &[String]) -> Option; + fn get_variable_type(&self, variable_names: &[String]) -> Option; /// Get configuration options fn options(&self) -> 
&ConfigOptions; diff --git a/datafusion/expr/src/simplify.rs b/datafusion/expr/src/simplify.rs index ccf45ff0d048..909147f3221b 100644 --- a/datafusion/expr/src/simplify.rs +++ b/datafusion/expr/src/simplify.rs @@ -17,10 +17,10 @@ //! Structs and traits to provide the information needed for expression simplification. -use arrow::datatypes::DataType; -use datafusion_common::{DFSchemaRef, DataFusionError, Result}; - use crate::{execution_props::ExecutionProps, Expr, ExprSchemable}; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; +use datafusion_common::{DFSchemaRef, DataFusionError, Result}; /// Provides the information necessary to apply algebraic simplification to an /// [Expr]. See [SimplifyContext] for one concrete implementation. @@ -39,7 +39,7 @@ pub trait SimplifyInfo { fn execution_props(&self) -> &ExecutionProps; /// Returns data type of this expr needed for determining optimized int type of a value - fn get_data_type(&self, expr: &Expr) -> Result; + fn get_data_type(&self, expr: &Expr) -> Result; } /// Provides simplification information based on DFSchema and @@ -75,7 +75,9 @@ impl<'a> SimplifyInfo for SimplifyContext<'a> { /// returns true if this Expr has boolean type fn is_boolean_type(&self, expr: &Expr) -> Result { for schema in &self.schema { - if let Ok(DataType::Boolean) = expr.get_type(schema) { + if let Ok(LogicalType::Boolean) = + expr.get_type(schema).map(|t| t.logical().clone()) + { return Ok(true); } } @@ -94,7 +96,7 @@ impl<'a> SimplifyInfo for SimplifyContext<'a> { } /// Returns data type of this expr needed for determining optimized int type of a value - fn get_data_type(&self, expr: &Expr) -> Result { + fn get_data_type(&self, expr: &Expr) -> Result { let schema = self.schema.as_ref().ok_or_else(|| { DataFusionError::Internal( "attempt to get data type without schema".to_string(), diff --git a/datafusion/expr/src/table_source.rs b/datafusion/expr/src/table_source.rs index 2de3cc923315..1cb8f2d3aaf2 100644 --- a/datafusion/expr/src/table_source.rs +++ b/datafusion/expr/src/table_source.rs @@ -19,7 +19,7 @@ use crate::{Expr, LogicalPlan}; -use arrow::datatypes::SchemaRef; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; use datafusion_common::{Constraints, Result}; use std::any::Any; @@ -86,7 +86,7 @@ pub trait TableSource: Sync + Send { fn as_any(&self) -> &dyn Any; /// Get a reference to the schema for this table - fn schema(&self) -> SchemaRef; + fn schema(&self) -> LogicalPhysicalSchemaRef; /// Get primary key indices, if one exists. fn constraints(&self) -> Option<&Constraints> { diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 4f79f3fa2b22..6ded17823e0a 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +// TODO(@notfilippo): make most of these accept LogicalType + //! 
Coercion rules for matching argument types for binary operators use std::collections::HashSet; diff --git a/datafusion/expr/src/type_coercion/functions.rs b/datafusion/expr/src/type_coercion/functions.rs index b430b343e484..ec07b5ad66eb 100644 --- a/datafusion/expr/src/type_coercion/functions.rs +++ b/datafusion/expr/src/type_coercion/functions.rs @@ -585,7 +585,7 @@ fn coerced_from<'a>( // List or LargeList with different dimensions should be handled in TypeSignature or other places before this (List(_) | LargeList(_), _) if datafusion_common::utils::base_type(type_from).eq(&Null) - || list_ndims(type_from) == list_ndims(type_into) => + || list_ndims(&type_from) == list_ndims(&type_into) => { Some(type_into.clone()) } diff --git a/datafusion/expr/src/type_coercion/mod.rs b/datafusion/expr/src/type_coercion/mod.rs index 86005da3dafa..b3243d2b2b21 100644 --- a/datafusion/expr/src/type_coercion/mod.rs +++ b/datafusion/expr/src/type_coercion/mod.rs @@ -36,52 +36,58 @@ pub mod binary; pub mod functions; pub mod other; -use arrow::datatypes::DataType; +use datafusion_common::logical_type::{ + signature::LogicalType, TypeRelation, LogicalPhysicalType, +}; + /// Determine whether the given data type `dt` represents signed numeric values. -pub fn is_signed_numeric(dt: &DataType) -> bool { +pub fn is_signed_numeric(dt: &LogicalPhysicalType) -> bool { + use LogicalType::*; matches!( - dt, - DataType::Int8 - | DataType::Int16 - | DataType::Int32 - | DataType::Int64 - | DataType::Float16 - | DataType::Float32 - | DataType::Float64 - | DataType::Decimal128(_, _) - | DataType::Decimal256(_, _), + dt.logical(), + Int8 | Int16 + | Int32 + | Int64 + | Float16 + | Float32 + | Float64 + | Decimal128(_, _) + | Decimal256(_, _), ) } /// Determine whether the given data type `dt` is `Null`. -pub fn is_null(dt: &DataType) -> bool { - *dt == DataType::Null +pub fn is_null(dt: &LogicalPhysicalType) -> bool { + *dt.logical() == LogicalType::Null } /// Determine whether the given data type `dt` is a `Timestamp`. -pub fn is_timestamp(dt: &DataType) -> bool { - matches!(dt, DataType::Timestamp(_, _)) +pub fn is_timestamp(dt: &LogicalPhysicalType) -> bool { + matches!(dt.logical(), LogicalType::Timestamp(_, _)) } /// Determine whether the given data type `dt` is an `Interval`. -pub fn is_interval(dt: &DataType) -> bool { - matches!(dt, DataType::Interval(_)) +pub fn is_interval(dt: &LogicalPhysicalType) -> bool { + matches!(dt.logical(), LogicalType::Interval(_)) } /// Determine whether the given data type `dt` is a `Date` or `Timestamp`. -pub fn is_datetime(dt: &DataType) -> bool { +pub fn is_datetime(dt: &LogicalPhysicalType) -> bool { matches!( - dt, - DataType::Date32 | DataType::Date64 | DataType::Timestamp(_, _) + dt.logical(), + LogicalType::Date | LogicalType::Timestamp(_, _) ) } /// Determine whether the given data type `dt` is a `Utf8` or `LargeUtf8`. -pub fn is_utf8_or_large_utf8(dt: &DataType) -> bool { - matches!(dt, DataType::Utf8 | DataType::LargeUtf8) +pub fn is_utf8_or_large_utf8(dt: &LogicalPhysicalType) -> bool { + matches!(dt.logical(), LogicalType::Utf8) } /// Determine whether the given data type `dt` is a `Decimal`.
-pub fn is_decimal(dt: &DataType) -> bool { - matches!(dt, DataType::Decimal128(_, _) | DataType::Decimal256(_, _)) +pub fn is_decimal(dt: &LogicalPhysicalType) -> bool { + matches!( + dt.logical(), + LogicalType::Decimal128(_, _) | LogicalType::Decimal256(_, _) + ) } diff --git a/datafusion/expr/src/utils.rs b/datafusion/expr/src/utils.rs index 45155cbd2c27..503112deb36e 100644 --- a/datafusion/expr/src/utils.rs +++ b/datafusion/expr/src/utils.rs @@ -28,7 +28,8 @@ use crate::{ and, BinaryExpr, Expr, ExprSchemable, Filter, GroupingSet, LogicalPlan, Operator, }; -use arrow::datatypes::{DataType, Field, Schema, TimeUnit}; +use arrow::datatypes::{DataType, TimeUnit}; +use datafusion_common::logical_type::signature::LogicalType; use datafusion_common::tree_node::{ Transformed, TransformedResult, TreeNode, TreeNodeRecursion, }; @@ -38,6 +39,9 @@ use datafusion_common::{ ScalarValue, TableReference, }; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use sqlparser::ast::{ExceptSelectItem, ExcludeSelectItem, WildcardAdditionalOptions}; /// The value to which `COUNT(*)` is expanded to in @@ -429,7 +433,7 @@ pub fn expand_qualified_wildcard( return plan_err!("Invalid qualifier {qualifier}"); } - let qualified_schema = Arc::new(Schema::new(fields_with_qualified)); + let qualified_schema = Arc::new(LogicalPhysicalSchema::new(fields_with_qualified)); let qualified_dfschema = DFSchema::try_from_qualified_schema(qualifier.clone(), &qualified_schema)? .with_functional_dependencies(projected_func_dependencies)?; @@ -727,7 +731,7 @@ pub fn from_plan( pub fn exprlist_to_fields<'a>( exprs: impl IntoIterator<Item = &'a Expr>, plan: &LogicalPlan, -) -> Result<Vec<(Option<TableReference>, Arc<Field>)>> { +) -> Result<Vec<(Option<TableReference>, Arc<LogicalPhysicalField>)>> { // look for exact match in plan's output schema let input_schema = &plan.schema(); exprs @@ -830,41 +834,32 @@ pub(crate) fn find_column_indexes_referenced_by_expr( /// can this data type be used in hash join equal conditions? /// data types here come from the function 'equal_rows'; if more data types are supported /// in equal_rows (hash join), add those data types here to generate the join logical plan.
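// A minimal usage sketch for the logical-type predicates above; it assumes
// the `From<DataType>` conversion into `LogicalPhysicalType` that this patch
// uses elsewhere (e.g. `DataType::UInt64.into()`), and the
// `datafusion_expr::type_coercion` import path is inferred from this module.
use arrow::datatypes::DataType;
use datafusion_common::logical_type::LogicalPhysicalType;
use datafusion_expr::type_coercion::{is_signed_numeric, is_timestamp};

fn predicate_usage_sketch() {
    // Build the combined logical/physical type from a concrete Arrow type.
    let dt: LogicalPhysicalType = DataType::Int32.into();
    // Dispatch on the logical signature rather than the physical encoding.
    assert!(is_signed_numeric(&dt));
    assert!(!is_timestamp(&dt));
}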
-pub fn can_hash(data_type: &DataType) -> bool { - match data_type { - DataType::Null => true, - DataType::Boolean => true, - DataType::Int8 => true, - DataType::Int16 => true, - DataType::Int32 => true, - DataType::Int64 => true, - DataType::UInt8 => true, - DataType::UInt16 => true, - DataType::UInt32 => true, - DataType::UInt64 => true, - DataType::Float32 => true, - DataType::Float64 => true, - DataType::Timestamp(time_unit, _) => match time_unit { +pub fn can_hash(data_type: &LogicalPhysicalType) -> bool { + use LogicalType::*; + match data_type.logical() { + Null => true, + Boolean => true, + Int8 => true, + Int16 => true, + Int32 => true, + Int64 => true, + UInt8 => true, + UInt16 => true, + UInt32 => true, + UInt64 => true, + Float32 => true, + Float64 => true, + Timestamp(time_unit, _) => match time_unit { TimeUnit::Second => true, TimeUnit::Millisecond => true, TimeUnit::Microsecond => true, TimeUnit::Nanosecond => true, }, - DataType::Utf8 => true, - DataType::LargeUtf8 => true, - DataType::Decimal128(_, _) => true, - DataType::Date32 => true, - DataType::Date64 => true, - DataType::FixedSizeBinary(_) => true, - DataType::Dictionary(key_type, value_type) - if *value_type.as_ref() == DataType::Utf8 => - { - DataType::is_dictionary_key_type(key_type) - } - DataType::List(_) => true, - DataType::LargeList(_) => true, - DataType::FixedSizeList(_, _) => true, - DataType::Struct(fields) => fields.iter().all(|f| can_hash(f.data_type())), + Utf8 => true, + Decimal128(_, _) => true, + Date => true, + List(_) => true, + Struct(fields) => fields.iter().all(|f| can_hash(f.data_type())), _ => false, } } diff --git a/datafusion/expr/src/var_provider.rs b/datafusion/expr/src/var_provider.rs index e00cf7407237..62150d637b1e 100644 --- a/datafusion/expr/src/var_provider.rs +++ b/datafusion/expr/src/var_provider.rs @@ -17,7 +17,7 @@ //! Variable provider -use arrow::datatypes::DataType; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{Result, ScalarValue}; /// Variable type, system/user defined @@ -35,7 +35,7 @@ pub trait VarProvider: std::fmt::Debug { fn get_value(&self, var_names: Vec) -> Result; /// Return the type of the given variable - fn get_type(&self, var_names: &[String]) -> Option; + fn get_type(&self, var_names: &[String]) -> Option; } pub fn is_system_variables(variable_names: &[String]) -> bool { diff --git a/datafusion/functions-array/src/planner.rs b/datafusion/functions-array/src/planner.rs index cfbe99b4b7fd..29ca30aa5bf1 100644 --- a/datafusion/functions-array/src/planner.rs +++ b/datafusion/functions-array/src/planner.rs @@ -17,6 +17,7 @@ //! 
SQL planning extensions like [`ArrayFunctionPlanner`] and [`FieldAccessPlanner`] +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{utils::list_ndims, DFSchema, Result}; use datafusion_expr::{ planner::{ExprPlanner, PlannerResult, RawBinaryExpr, RawFieldAccessExpr}, @@ -45,8 +46,8 @@ impl ExprPlanner for ArrayFunctionPlanner { if op == sqlparser::ast::BinaryOperator::StringConcat { let left_type = left.get_type(schema)?; let right_type = right.get_type(schema)?; - let left_list_ndims = list_ndims(&left_type); - let right_list_ndims = list_ndims(&right_type); + let left_list_ndims = list_ndims(left_type.physical()); + let right_list_ndims = list_ndims(right_type.physical()); // Rewrite string concat operator to function based on types // if we get list || list then we rewrite it to array_concat() @@ -73,8 +74,8 @@ impl ExprPlanner for ArrayFunctionPlanner { ) { let left_type = left.get_type(schema)?; let right_type = right.get_type(schema)?; - let left_list_ndims = list_ndims(&left_type); - let right_list_ndims = list_ndims(&right_type); + let left_list_ndims = list_ndims(left_type.physical()); + let right_list_ndims = list_ndims(right_type.physical()); // if both are list if left_list_ndims > 0 && right_list_ndims > 0 { if op == sqlparser::ast::BinaryOperator::AtArrow { diff --git a/datafusion/functions/src/core/arrow_cast.rs b/datafusion/functions/src/core/arrow_cast.rs index 9c410d4e18e8..d3200a0a10d0 100644 --- a/datafusion/functions/src/core/arrow_cast.rs +++ b/datafusion/functions/src/core/arrow_cast.rs @@ -107,7 +107,7 @@ impl ScalarUDFImpl for ArrowCastFunc { info: &dyn SimplifyInfo, ) -> Result<ExprSimplifyResult> { // convert this into a real cast - let target_type = data_type_from_args(&args)?; + let target_type = data_type_from_args(&args)?.into(); // remove second (type) argument args.pop().unwrap(); let arg = args.pop().unwrap(); @@ -130,6 +130,8 @@ impl ScalarUDFImpl for ArrowCastFunc { /// Returns the requested type from the arguments fn data_type_from_args(args: &[Expr]) -> Result<DataType> { + // TODO(@notfilippo): maybe parse LogicalType? + if args.len() != 2 { return plan_err!("arrow_cast needs 2 arguments, {} provided", args.len()); } diff --git a/datafusion/functions/src/core/getfield.rs b/datafusion/functions/src/core/getfield.rs index b76da15c52ca..4a28b47d7054 100644 --- a/datafusion/functions/src/core/getfield.rs +++ b/datafusion/functions/src/core/getfield.rs @@ -20,6 +20,8 @@ use arrow::array::{ }; use arrow::datatypes::DataType; use datafusion_common::cast::{as_map_array, as_struct_array}; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{ exec_err, plan_datafusion_err, plan_err, ExprSchema, Result, ScalarValue, }; @@ -105,35 +107,36 @@ impl ScalarUDFImpl for GetFieldFunc { ); } }; + // TODO(@notfilippo): avoid converting to physical type let data_type = args[0].get_type(schema)?; - match (data_type, name) { - (DataType::Map(fields, _), _) => { - match fields.data_type() { - DataType::Struct(fields) if fields.len() == 2 => { + match (data_type.logical(), name) { + (LogicalType::Map(fields, _), _) => { + match fields.data_type().logical() { + LogicalType::Struct(fields) if fields.len() == 2 => { // Arrow's MapArray is essentially a ListArray of structs with two columns. They are // often named "key", and "value", but we don't require any specific naming here; // instead, we assume that the second column is the "value" column both here and in // execution.
let value_field = fields.get(1).expect("fields should have exactly two members"); - Ok(value_field.data_type().clone()) + Ok(value_field.data_type().physical().clone()) }, _ => plan_err!("Map fields must contain a Struct with exactly 2 fields"), } } - (DataType::Struct(fields), ScalarValue::Utf8(Some(s))) => { + (LogicalType::Struct(fields), ScalarValue::Utf8(Some(s))) => { if s.is_empty() { plan_err!( "Struct based indexed access requires a non empty string" ) } else { let field = fields.iter().find(|f| f.name() == s); - field.ok_or(plan_datafusion_err!("Field {s} not found in struct")).map(|f| f.data_type().clone()) + field.ok_or(plan_datafusion_err!("Field {s} not found in struct")).map(|f| f.data_type().clone().physical().clone()) } } - (DataType::Struct(_), _) => plan_err!( + (LogicalType::Struct(_), _) => plan_err!( "Only UTF8 strings are valid as an indexed field in a struct" ), - (DataType::Null, _) => Ok(DataType::Null), + (LogicalType::Null, _) => Ok(DataType::Null), (other, _) => plan_err!("The expression to get an indexed field is only valid for `List`, `Struct`, `Map` or `Null` types, got {other}"), } } diff --git a/datafusion/functions/src/core/named_struct.rs b/datafusion/functions/src/core/named_struct.rs index 8ccda977f3a4..39718cbbb73c 100644 --- a/datafusion/functions/src/core/named_struct.rs +++ b/datafusion/functions/src/core/named_struct.rs @@ -17,6 +17,7 @@ use arrow::array::StructArray; use arrow::datatypes::{DataType, Field, Fields}; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{exec_err, internal_err, Result, ScalarValue}; use datafusion_expr::{ColumnarValue, Expr, ExprSchemable}; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; @@ -139,7 +140,7 @@ impl ScalarUDFImpl for NamedStructFunc { let value = &chunk[1]; if let Expr::Literal(ScalarValue::Utf8(Some(name))) = name { - Ok(Field::new(name, value.get_type(schema)?, true)) + Ok(Field::new(name, value.get_type(schema)?.physical().clone(), true)) } else { exec_err!("named_struct even arguments must be string literals, got {name} instead at position {}", i * 2) } diff --git a/datafusion/functions/src/math/log.rs b/datafusion/functions/src/math/log.rs index 0791561539e1..b4fc8fd46ff5 100644 --- a/datafusion/functions/src/math/log.rs +++ b/datafusion/functions/src/math/log.rs @@ -24,6 +24,7 @@ use super::power::PowerFunc; use arrow::array::{ArrayRef, Float32Array, Float64Array}; use arrow::datatypes::DataType; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{ exec_err, internal_err, plan_datafusion_err, plan_err, DataFusionError, Result, ScalarValue, @@ -158,6 +159,7 @@ impl ScalarUDFImpl for LogFunc { Ok(ColumnarValue::Array(arr)) } + // TODO(@notfilippo): avoid converting to physical type /// Simplify the `log` function by the relevant rules: /// 1. Log(a, 1) ===> 0 /// 2. Log(a, Power(a, b)) ===> b @@ -182,13 +184,15 @@ impl ScalarUDFImpl for LogFunc { let base = if let Some(base) = args.pop() { base } else { - lit(ScalarValue::new_ten(&number_datatype)?) + lit(ScalarValue::new_ten(&number_datatype.physical())?) }; match number { - Expr::Literal(value) if value == ScalarValue::new_one(&number_datatype)? => { + Expr::Literal(value) + if value == ScalarValue::new_one(&number_datatype.physical())? 
=> + { Ok(ExprSimplifyResult::Simplified(lit(ScalarValue::new_zero( - &info.get_data_type(&base)?, + &info.get_data_type(&base)?.physical(), )?))) } Expr::ScalarFunction(ScalarFunction { func, mut args }) @@ -200,7 +204,7 @@ impl ScalarUDFImpl for LogFunc { number => { if number == base { Ok(ExprSimplifyResult::Simplified(lit(ScalarValue::new_one( - &number_datatype, + &number_datatype.physical(), )?))) } else { let args = match num_args { diff --git a/datafusion/functions/src/math/power.rs b/datafusion/functions/src/math/power.rs index 5b790fb56ddf..fb4ce2607fd2 100644 --- a/datafusion/functions/src/math/power.rs +++ b/datafusion/functions/src/math/power.rs @@ -19,6 +19,7 @@ use arrow::datatypes::{ArrowNativeTypeOp, DataType}; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{ arrow_datafusion_err, exec_datafusion_err, exec_err, plan_datafusion_err, DataFusionError, Result, ScalarValue, @@ -27,14 +28,13 @@ use datafusion_expr::expr::ScalarFunction; use datafusion_expr::simplify::{ExprSimplifyResult, SimplifyInfo}; use datafusion_expr::{ColumnarValue, Expr, ScalarUDF}; +use super::log::LogFunc; use arrow::array::{ArrayRef, Float64Array, Int64Array}; use datafusion_expr::TypeSignature::*; use datafusion_expr::{ScalarUDFImpl, Signature, Volatility}; use std::any::Any; use std::sync::Arc; -use super::log::LogFunc; - #[derive(Debug)] pub struct PowerFunc { signature: Signature, @@ -127,6 +127,7 @@ impl ScalarUDFImpl for PowerFunc { Ok(ColumnarValue::Array(arr)) } + // TODO(@notfilippo): avoid converting to physical type /// Simplify the `power` function by the relevant rules: /// 1. Power(a, 0) ===> 0 /// 2. Power(a, 1) ===> a @@ -143,11 +144,11 @@ impl ScalarUDFImpl for PowerFunc { plan_datafusion_err!("Expected power to have 2 arguments, got 1") })?; - let exponent_type = info.get_data_type(&exponent)?; + let exponent_type = info.get_data_type(&exponent)?.physical().clone(); match exponent { Expr::Literal(value) if value == ScalarValue::new_zero(&exponent_type)? => { Ok(ExprSimplifyResult::Simplified(Expr::Literal( - ScalarValue::new_one(&info.get_data_type(&base)?)?, + ScalarValue::new_one(&info.get_data_type(&base)?.physical().clone())?, ))) } Expr::Literal(value) if value == ScalarValue::new_one(&exponent_type)? 
=> { diff --git a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs index 959ffdaaa212..fd78b4b26fd8 100644 --- a/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs +++ b/datafusion/optimizer/src/analyzer/count_wildcard_rule.rs @@ -107,9 +107,8 @@ mod tests { WindowFrameUnits, }; use datafusion_functions_aggregate::count::count_udaf; - use std::sync::Arc; - use datafusion_functions_aggregate::expr_fn::{count, sum}; + use std::sync::Arc; fn assert_plan_eq(plan: LogicalPlan, expected: &str) -> Result<()> { assert_analyzed_plan_eq_display_indent( diff --git a/datafusion/optimizer/src/analyzer/function_rewrite.rs b/datafusion/optimizer/src/analyzer/function_rewrite.rs index 098c934bf7e1..86d106bb8975 100644 --- a/datafusion/optimizer/src/analyzer/function_rewrite.rs +++ b/datafusion/optimizer/src/analyzer/function_rewrite.rs @@ -53,7 +53,7 @@ impl ApplyFunctionRewrites { if let LogicalPlan::TableScan(ts) = &plan { let source_schema = DFSchema::try_from_qualified_schema( ts.table_name.clone(), - &ts.source.schema(), + &ts.source.schema().as_ref().clone().into(), )?; schema.merge(&source_schema); } diff --git a/datafusion/optimizer/src/analyzer/inline_table_scan.rs b/datafusion/optimizer/src/analyzer/inline_table_scan.rs index 73ab37cb11d8..40893fa948ef 100644 --- a/datafusion/optimizer/src/analyzer/inline_table_scan.rs +++ b/datafusion/optimizer/src/analyzer/inline_table_scan.rs @@ -106,6 +106,8 @@ mod tests { use crate::test::assert_analyzed_plan_eq; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::logical_type::field::LogicalPhysicalField; + use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef}; use datafusion_expr::{col, lit, LogicalPlan, LogicalPlanBuilder, TableSource}; pub struct RawTableSource {} @@ -115,10 +117,10 @@ mod tests { self } - fn schema(&self) -> arrow::datatypes::SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int64, false), - Field::new("b", DataType::Int64, false), + fn schema(&self) -> LogicalPhysicalSchemaRef { + Arc::new(LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("a", DataType::Int64, false), + LogicalPhysicalField::new("b", DataType::Int64, false), ])) } @@ -159,8 +161,8 @@ mod tests { Ok(datafusion_expr::TableProviderFilterPushDown::Exact) } - fn schema(&self) -> arrow::datatypes::SchemaRef { - Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)])) + fn schema(&self) -> LogicalPhysicalSchemaRef { + Arc::new(Schema::new(vec![Field::new("a", DataType::Int64, false)]).into()) } fn get_logical_plan(&self) -> Option<&LogicalPlan> { diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 6c08b3e998b3..82166331dfab 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -22,6 +22,8 @@ use std::sync::Arc; use arrow::datatypes::{DataType, IntervalUnit}; use datafusion_common::config::ConfigOptions; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion_common::{ exec_err, internal_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, @@ -93,7 +95,7 @@ fn analyze_internal( if let LogicalPlan::TableScan(ts) = &plan { let source_schema = DFSchema::try_from_qualified_schema( 
ts.table_name.clone(), - &ts.source.schema(), + &ts.source.schema().as_ref().clone().into(), )?; schema.merge(&source_schema); } @@ -161,13 +163,13 @@ impl<'a> TypeCoercionRewriter<'a> { right: Expr, ) -> Result<(Expr, Expr)> { let (left_type, right_type) = get_input_types( - &left.get_type(self.schema)?, + &left.get_type(self.schema)?.physical(), &op, - &right.get_type(self.schema)?, + &right.get_type(self.schema)?.physical(), )?; Ok(( - left.cast_to(&left_type, self.schema)?, - right.cast_to(&right_type, self.schema)?, + left.cast_to(&left_type.into(), self.schema)?, + right.cast_to(&right_type.into(), self.schema)?, )) } } @@ -210,7 +212,7 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { analyze_internal(self.schema, unwrap_arc(subquery.subquery))?.data; let expr_type = expr.get_type(self.schema)?; let subquery_type = new_plan.schema().field(0).data_type(); - let common_type = comparison_coercion(&expr_type, subquery_type).ok_or(plan_datafusion_err!( + let common_type = comparison_coercion(&expr_type.physical(), &subquery_type.physical()).ok_or(plan_datafusion_err!( "expr type {expr_type:?} can't cast to {subquery_type:?} in InSubquery" ), )?; @@ -219,8 +221,8 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { outer_ref_columns: subquery.outer_ref_columns, }; Ok(Transformed::yes(Expr::InSubquery(InSubquery::new( - Box::new(expr.cast_to(&common_type, self.schema)?), - cast_subquery(new_subquery, &common_type)?, + Box::new(expr.cast_to(&common_type.clone().into(), self.schema)?), + cast_subquery(new_subquery, &common_type.into())?, negated, )))) } @@ -255,7 +257,7 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { }) => { let left_type = expr.get_type(self.schema)?; let right_type = pattern.get_type(self.schema)?; - let coerced_type = like_coercion(&left_type, &right_type).ok_or_else(|| { + let coerced_type = like_coercion(&left_type.physical(), &right_type.physical()).ok_or_else(|| { let op_name = if case_insensitive { "ILIKE" } else { @@ -266,10 +268,12 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { ) })?; let expr = match left_type { - DataType::Dictionary(_, inner) if *inner == DataType::Utf8 => expr, - _ => Box::new(expr.cast_to(&coerced_type, self.schema)?), + _ => { + Box::new(expr.cast_to(&coerced_type.clone().into(), self.schema)?) 
+ } }; - let pattern = Box::new(pattern.cast_to(&coerced_type, self.schema)?); + let pattern = + Box::new(pattern.cast_to(&coerced_type.into(), self.schema)?); Ok(Transformed::yes(Expr::Like(Like::new( negated, expr, @@ -294,14 +298,14 @@ }) => { let expr_type = expr.get_type(self.schema)?; let low_type = low.get_type(self.schema)?; - let low_coerced_type = comparison_coercion(&expr_type, &low_type) + let low_coerced_type = comparison_coercion(&expr_type.physical(), &low_type.physical()) .ok_or_else(|| { DataFusionError::Internal(format!( "Failed to coerce types {expr_type} and {low_type} in BETWEEN expression" )) })?; let high_type = high.get_type(self.schema)?; - let high_coerced_type = comparison_coercion(&expr_type, &low_type) + let high_coerced_type = comparison_coercion(&expr_type.physical(), &low_type.physical()) .ok_or_else(|| { DataFusionError::Internal(format!( "Failed to coerce types {expr_type} and {high_type} in BETWEEN expression" @@ -313,7 +317,7 @@ DataFusionError::Internal(format!( "Failed to coerce types {expr_type} and {high_type} in BETWEEN expression" )) - })?; + })?.into(); Ok(Transformed::yes(Expr::Between(Between::new( Box::new(expr.cast_to(&coercion_type, self.schema)?), negated, @@ -326,24 +330,31 @@ list, negated, }) => { let expr_data_type = expr.get_type(self.schema)?; let list_data_types = list .iter() - .map(|list_expr| list_expr.get_type(self.schema)) + .map(|list_expr| { + list_expr + .get_type(self.schema) + .map(|t| t.physical().clone()) + }) .collect::<Result<Vec<_>>>()?; - let result_type = - get_coerce_type_for_list(&expr_data_type, &list_data_types); + let result_type = get_coerce_type_for_list( + &expr_data_type.physical(), + &list_data_types, + ); match result_type { None => plan_err!( "Can not find compatible types to compare {expr_data_type:?} with {list_data_types:?}" ), Some(coerced_type) => { // find the coerced type - let cast_expr = expr.cast_to(&coerced_type, self.schema)?; + let logical_coerced_type = coerced_type.into(); + let cast_expr = expr.cast_to(&logical_coerced_type, self.schema)?; let cast_list_expr = list .into_iter() .map(|list_expr| { - list_expr.cast_to(&coerced_type, self.schema) + list_expr.cast_to(&logical_coerced_type, self.schema) }) .collect::<Result<Vec<_>>>()?; Ok(Transformed::yes(Expr::InList(InList::new( @@ -364,7 +376,6 @@ self.schema, &func, )?; - let new_expr = coerce_arguments_for_fun(new_expr, self.schema, &func)?; Ok(Transformed::yes(Expr::ScalarFunction( ScalarFunction::new_udf(func, new_expr), ))) @@ -473,16 +484,16 @@ impl<'a> TreeNodeRewriter for TypeCoercionRewriter<'a> { /// Casts the given `value` to `target_type`. Note that this function /// only considers `Null` or `Utf8` values.
-fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result { +fn coerce_scalar(target_type: &LogicalPhysicalType, value: &ScalarValue) -> Result { match value { // Coerce Utf8 values: ScalarValue::Utf8(Some(val)) => { - ScalarValue::try_from_string(val.clone(), target_type) + ScalarValue::try_from_string(val.clone(), &target_type.physical()) } s => { if s.is_null() { // Coerce `Null` values: - ScalarValue::try_from(target_type) + ScalarValue::try_from(target_type.physical().clone()) } else { // Values except `Utf8`/`Null` variants already have the right type // (casted before) since we convert `sqlparser` outputs to `Utf8` @@ -500,15 +511,15 @@ fn coerce_scalar(target_type: &DataType, value: &ScalarValue) -> Result Result { coerce_scalar(target_type, &value).or_else(|err| { // If type coercion fails, check if the largest type in family works: if let Some(largest_type) = get_widest_type_in_family(target_type) { - coerce_scalar(largest_type, &value).map_or_else( + coerce_scalar(&largest_type, &value).map_or_else( |_| exec_err!("Cannot cast {value:?} to {target_type:?}"), - |_| ScalarValue::try_from(target_type), + |_| ScalarValue::try_from(target_type.physical().clone()), ) } else { Err(err) @@ -519,18 +530,19 @@ fn coerce_scalar_range_aware( /// This function returns the widest type in the family of `given_type`. /// If the given type is already the widest type, it returns `None`. /// For example, if `given_type` is `Int8`, it returns `Int64`. -fn get_widest_type_in_family(given_type: &DataType) -> Option<&DataType> { - match given_type { - DataType::UInt8 | DataType::UInt16 | DataType::UInt32 => Some(&DataType::UInt64), - DataType::Int8 | DataType::Int16 | DataType::Int32 => Some(&DataType::Int64), - DataType::Float16 | DataType::Float32 => Some(&DataType::Float64), +fn get_widest_type_in_family(given_type: &LogicalPhysicalType) -> Option { + use LogicalType::*; + match given_type.logical() { + UInt8 | UInt16 | UInt32 => Some(DataType::UInt64.into()), + Int8 | Int16 | Int32 => Some(DataType::Int64.into()), + Float16 | Float32 => Some(DataType::Float64.into()), _ => None, } } /// Coerces the given (window frame) `bound` to `target_type`. 
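// A hedged sketch of the conversion pattern used throughout this hunk:
// planner-level code keeps `LogicalPhysicalType` and only drops down to a
// concrete Arrow `DataType` via `TypeRelation::physical()` at the few points
// that still require one, such as `ScalarValue` construction. This assumes
// `physical()` returns `&DataType`, consistent with the calls above.
use datafusion_common::logical_type::{LogicalPhysicalType, TypeRelation};
use datafusion_common::{Result, ScalarValue};

fn zero_for(target_type: &LogicalPhysicalType) -> Result<ScalarValue> {
    // `ScalarValue` still speaks Arrow's type system, so convert at the edge only.
    ScalarValue::new_zero(target_type.physical())
}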
fn coerce_frame_bound( - target_type: &DataType, + target_type: &LogicalPhysicalType, bound: WindowFrameBound, ) -> Result { match bound { @@ -559,13 +571,13 @@ fn coerce_window_frame( let target_type = match window_frame.units { WindowFrameUnits::Range => { if let Some(col_type) = current_types.first() { - if col_type.is_numeric() + if col_type.logical().is_numeric() || is_utf8_or_large_utf8(col_type) - || matches!(col_type, DataType::Null) + || matches!(col_type.logical(), LogicalType::Null) { - col_type + col_type.clone() } else if is_datetime(col_type) { - &DataType::Interval(IntervalUnit::MonthDayNano) + DataType::Interval(IntervalUnit::MonthDayNano).into() } else { return internal_err!( "Cannot run range queries on datatype: {col_type:?}" @@ -575,10 +587,11 @@ fn coerce_window_frame( return internal_err!("ORDER BY column cannot be empty"); } } - WindowFrameUnits::Rows | WindowFrameUnits::Groups => &DataType::UInt64, + WindowFrameUnits::Rows | WindowFrameUnits::Groups => DataType::UInt64.into(), }; - window_frame.start_bound = coerce_frame_bound(target_type, window_frame.start_bound)?; - window_frame.end_bound = coerce_frame_bound(target_type, window_frame.end_bound)?; + window_frame.start_bound = + coerce_frame_bound(&target_type, window_frame.start_bound)?; + window_frame.end_bound = coerce_frame_bound(&target_type, window_frame.end_bound)?; Ok(window_frame) } @@ -586,8 +599,12 @@ fn coerce_window_frame( // The above op will be rewrite to the binary op when creating the physical op. fn get_casted_expr_for_bool_op(expr: Expr, schema: &DFSchema) -> Result { let left_type = expr.get_type(schema)?; - get_input_types(&left_type, &Operator::IsDistinctFrom, &DataType::Boolean)?; - expr.cast_to(&DataType::Boolean, schema) + get_input_types( + &left_type.physical(), + &Operator::IsDistinctFrom, + &DataType::Boolean, + )?; + expr.cast_to(&DataType::Boolean.into(), schema) } /// Returns `expressions` coerced to types compatible with @@ -605,15 +622,15 @@ fn coerce_arguments_for_signature_with_scalar_udf( let current_types = expressions .iter() - .map(|e| e.get_type(schema)) + .map(|e| e.get_type(schema).map(|t| t.physical().clone())) .collect::>>()?; let new_types = data_types_with_scalar_udf(¤t_types, func)?; expressions .into_iter() - .enumerate() - .map(|(i, expr)| expr.cast_to(&new_types[i], schema)) + .zip(new_types) + .map(|(expr, t)| expr.cast_to(&t.into(), schema)) .collect() } @@ -632,42 +649,18 @@ fn coerce_arguments_for_signature_with_aggregate_udf( let current_types = expressions .iter() - .map(|e| e.get_type(schema)) + .map(|e| e.get_type(schema).map(|t| t.physical().clone())) .collect::>>()?; let new_types = data_types_with_aggregate_udf(¤t_types, func)?; expressions .into_iter() - .enumerate() - .map(|(i, expr)| expr.cast_to(&new_types[i], schema)) + .zip(new_types) + .map(|(expr, t)| expr.cast_to(&t.into(), schema)) .collect() } -fn coerce_arguments_for_fun( - expressions: Vec, - schema: &DFSchema, - fun: &Arc, -) -> Result> { - // Cast Fixedsizelist to List for array functions - if fun.name() == "make_array" { - expressions - .into_iter() - .map(|expr| { - let data_type = expr.get_type(schema).unwrap(); - if let DataType::FixedSizeList(field, _) = data_type { - let to_type = DataType::List(field.clone()); - expr.cast_to(&to_type, schema) - } else { - Ok(expr) - } - }) - .collect() - } else { - Ok(expressions) - } -} - /// Returns the coerced exprs for each `input_exprs`. 
/// Get the coerced data type from `aggregate_rule::coerce_types` and add `try_cast` if the /// data type of `input_exprs` need to be coerced. @@ -682,7 +675,7 @@ fn coerce_agg_exprs_for_signature( } let current_types = input_exprs .iter() - .map(|e| e.get_type(schema)) + .map(|e| e.get_type(schema).map(|t| t.physical().clone())) .collect::>>()?; let coerced_types = @@ -690,8 +683,8 @@ fn coerce_agg_exprs_for_signature( input_exprs .into_iter() - .enumerate() - .map(|(i, expr)| expr.cast_to(&coerced_types[i], schema)) + .zip(coerced_types) + .map(|(expr, t)| expr.cast_to(&t.into(), schema)) .collect() } @@ -735,12 +728,12 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { let then_types = case .when_then_expr .iter() - .map(|(_when, then)| then.get_type(schema)) + .map(|(_when, then)| then.get_type(schema).map(|t| t.physical().clone())) .collect::>>()?; let else_type = case .else_expr .as_ref() - .map(|expr| expr.get_type(schema)) + .map(|expr| expr.get_type(schema).map(|t| t.physical().clone())) .transpose()?; // find common coercible types @@ -750,10 +743,12 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { let when_types = case .when_then_expr .iter() - .map(|(when, _then)| when.get_type(schema)) + .map(|(when, _then)| when.get_type(schema).map(|t| t.physical().clone())) .collect::>>()?; - let coerced_type = - get_coerce_type_for_case_expression(&when_types, Some(case_type)); + let coerced_type = get_coerce_type_for_case_expression( + &when_types, + Some(&case_type.physical()), + ); coerced_type.ok_or_else(|| { plan_datafusion_err!( "Failed to coerce case ({case_type:?}) and when ({when_types:?}) \ @@ -776,7 +771,9 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { let case_expr = case .expr .zip(case_when_coerce_type.as_ref()) - .map(|(case_expr, coercible_type)| case_expr.cast_to(coercible_type, schema)) + .map(|(case_expr, coercible_type)| { + case_expr.cast_to(&coercible_type.into(), schema) + }) .transpose()? .map(Box::new); let when_then = case @@ -784,7 +781,7 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { .into_iter() .map(|(when, then)| { let when_type = case_when_coerce_type.as_ref().unwrap_or(&DataType::Boolean); - let when = when.cast_to(when_type, schema).map_err(|e| { + let when = when.cast_to(&when_type.into(), schema).map_err(|e| { DataFusionError::Context( format!( "WHEN expressions in CASE couldn't be \ @@ -793,13 +790,13 @@ fn coerce_case_expression(case: Case, schema: &DFSchema) -> Result { Box::new(e), ) })?; - let then = then.cast_to(&then_else_coerce_type, schema)?; + let then = then.cast_to(&then_else_coerce_type.clone().into(), schema)?; Ok((Box::new(when), Box::new(then))) }) .collect::>>()?; let else_expr = case .else_expr - .map(|expr| expr.cast_to(&then_else_coerce_type, schema)) + .map(|expr| expr.cast_to(&then_else_coerce_type.into(), schema)) .transpose()? 
.map(Box::new); @@ -814,6 +811,8 @@ mod test { use arrow::datatypes::DataType::Utf8; use arrow::datatypes::{DataType, Field, TimeUnit}; + use datafusion_common::logical_type::field::LogicalPhysicalField; + use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::tree_node::{TransformedResult, TreeNode}; use datafusion_common::{DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{self, InSubquery, Like, ScalarFunction}; @@ -839,12 +838,12 @@ mod test { })) } - fn empty_with_type(data_type: DataType) -> Arc<LogicalPlan> { + fn empty_with_type(data_type: impl Into<LogicalPhysicalType>) -> Arc<LogicalPlan> { Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: Arc::new( DFSchema::from_unqualified_fields( - vec![Field::new("a", data_type, true)].into(), + vec![LogicalPhysicalField::new("a", data_type, true)].into(), std::collections::HashMap::new(), ) .unwrap(), @@ -894,7 +893,7 @@ } fn return_type(&self, _args: &[DataType]) -> Result<DataType> { - Ok(DataType::Utf8) + Ok(Utf8) } fn invoke(&self, _args: &[ColumnarValue]) -> Result<ColumnarValue> { @@ -1082,7 +1081,7 @@ let empty = Arc::new(LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: Arc::new(DFSchema::from_unqualified_fields( - vec![Field::new("a", DataType::Decimal128(12, 4), true)].into(), + vec![LogicalPhysicalField::new("a", DataType::Decimal128(12, 4), true)].into(), std::collections::HashMap::new(), )?), })); @@ -1193,6 +1192,6 @@ let plan = LogicalPlan::Projection(Projection::try_new(vec![like_expr], empty)?); let err = assert_analyzed_plan_eq(Arc::new(TypeCoercion::new()), plan, expected); assert!(err.is_err()); assert!(err.unwrap_err().to_string().contains( "There isn't a common type to coerce Int64 and Utf8 in LIKE expression" )); @@ -1279,7 +1279,7 @@ fn test_type_coercion_rewrite() -> Result<()> { // gt let schema = Arc::new(DFSchema::from_unqualified_fields( - vec![Field::new("a", DataType::Int64, true)].into(), + vec![LogicalPhysicalField::new("a", DataType::Int64, true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema: &schema }; @@ -1290,7 +1290,7 @@ // eq let schema = Arc::new(DFSchema::from_unqualified_fields( - vec![Field::new("a", DataType::Int64, true)].into(), + vec![LogicalPhysicalField::new("a", DataType::Int64, true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema: &schema }; @@ -1301,7 +1301,7 @@ // lt let schema = Arc::new(DFSchema::from_unqualified_fields( - vec![Field::new("a", DataType::Int64, true)].into(), + vec![LogicalPhysicalField::new("a", DataType::Int64, true)].into(), std::collections::HashMap::new(), )?); let mut rewriter = TypeCoercionRewriter { schema: &schema }; @@ -1331,7 +1331,7 @@ fn cast_if_not_same_type( expr: Box<Expr>, - data_type: &DataType, + data_type: &LogicalPhysicalType, schema: &DFSchemaRef, ) -> Box<Expr> { if &expr.get_type(schema).unwrap() != data_type { @@ -1343,8 +1343,8 @@ fn cast_helper( case: Case, - case_when_type: DataType, - then_else_type: DataType, + case_when_type: LogicalPhysicalType, + then_else_type: LogicalPhysicalType, schema: &DFSchemaRef, ) -> Case { let expr = case @@ -1375,23 +1375,23 @@ fn test_case_expression_coercion() -> Result<()> { let schema = Arc::new(DFSchema::from_unqualified_fields( vec![ - Field::new("boolean", DataType::Boolean, true), - Field::new("integer", DataType::Int32, true), - Field::new("float", 
DataType::Float32, true), - Field::new( + LogicalPhysicalField::new("boolean", DataType::Boolean, true), + LogicalPhysicalField::new("integer", DataType::Int32, true), + LogicalPhysicalField::new("float", DataType::Float32, true), + LogicalPhysicalField::new( "timestamp", DataType::Timestamp(TimeUnit::Nanosecond, None), true, ), - Field::new("date", DataType::Date32, true), - Field::new( + LogicalPhysicalField::new("date", DataType::Date32, true), + LogicalPhysicalField::new( "interval", DataType::Interval(arrow::datatypes::IntervalUnit::MonthDayNano), true, ), - Field::new("binary", DataType::Binary, true), - Field::new("string", DataType::Utf8, true), - Field::new("decimal", DataType::Decimal128(10, 10), true), + LogicalPhysicalField::new("binary", DataType::Binary, true), + LogicalPhysicalField::new("string", DataType::Utf8, true), + LogicalPhysicalField::new("decimal", DataType::Decimal128(10, 10), true), ] .into(), std::collections::HashMap::new(), @@ -1410,8 +1410,8 @@ mod test { let then_else_common_type = DataType::Utf8; let expected = cast_helper( case.clone(), - case_when_common_type, - then_else_common_type, + case_when_common_type.into(), + then_else_common_type.into(), &schema, ); let actual = coerce_case_expression(case, &schema)?; @@ -1430,8 +1430,8 @@ mod test { let then_else_common_type = DataType::Utf8; let expected = cast_helper( case.clone(), - case_when_common_type, - then_else_common_type, + case_when_common_type.into(), + then_else_common_type.into(), &schema, ); let actual = coerce_case_expression(case, &schema)?; diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 4a4933fe9cfd..68be437f77e8 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -1132,6 +1132,7 @@ mod test { use std::iter; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_expr::expr::AggregateFunction; use datafusion_expr::logical_plan::{table_scan, JoinType}; use datafusion_expr::{ @@ -1743,9 +1744,9 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("c")]]); let schema = DFSchema::from_unqualified_fields( vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, false), - Field::new("c", DataType::Int32, false), + LogicalPhysicalField::new("a", DataType::Int32, false), + LogicalPhysicalField::new("b", DataType::Int32, false), + LogicalPhysicalField::new("c", DataType::Int32, false), ] .into(), HashMap::default(), @@ -1762,8 +1763,8 @@ mod test { let grouping = grouping_set(vec![vec![col("a"), col("b")], vec![col("a")]]); let schema = DFSchema::from_unqualified_fields( vec![ - Field::new("a", DataType::Int32, false), - Field::new("b", DataType::Int32, false), + LogicalPhysicalField::new("a", DataType::Int32, false), + LogicalPhysicalField::new("b", DataType::Int32, false), ] .into(), HashMap::default(), @@ -1830,7 +1831,7 @@ mod test { fn test_extract_expressions_from_col() -> Result<()> { let mut result = Vec::with_capacity(1); let schema = DFSchema::from_unqualified_fields( - vec![Field::new("a", DataType::Int32, false)].into(), + vec![LogicalPhysicalField::new("a", DataType::Int32, false)].into(), HashMap::default(), )?; extract_expressions(&col("a"), &schema, &mut result)?; diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs index 
edf6b72d7e17..c0a6e7d9b581 100644 --- a/datafusion/optimizer/src/eliminate_one_union.rs +++ b/datafusion/optimizer/src/eliminate_one_union.rs @@ -65,6 +65,7 @@ mod tests { use super::*; use crate::test::*; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::logical_type::schema::LogicalPhysicalSchema; use datafusion_common::ToDFSchema; use datafusion_expr::{ expr_rewriter::coerce_plan_expr_for_schema, logical_plan::table_scan, @@ -108,7 +109,7 @@ mod tests { fn eliminate_one_union() -> Result<()> { let table_plan = coerce_plan_expr_for_schema( &table_scan(Some("table"), &schema(), None)?.build()?, - &schema().to_dfschema()?, + &LogicalPhysicalSchema::from(schema()).to_dfschema()?, )?; let schema = table_plan.schema().clone(); let single_union_plan = LogicalPlan::Union(Union { diff --git a/datafusion/optimizer/src/extract_equijoin_predicate.rs b/datafusion/optimizer/src/extract_equijoin_predicate.rs index 87d205139e8e..2c4b47b4d5a0 100644 --- a/datafusion/optimizer/src/extract_equijoin_predicate.rs +++ b/datafusion/optimizer/src/extract_equijoin_predicate.rs @@ -362,8 +362,8 @@ mod tests { // filter: t1.a + CAST(Int64(1), UInt32) = t2.a + CAST(Int64(2), UInt32) as t1.a + 1 = t2.a + 2 let filter = Expr::eq( - col("t1.a") + lit(1i64).cast_to(&DataType::UInt32, &t1_schema)?, - col("t2.a") + lit(2i32).cast_to(&DataType::UInt32, &t2_schema)?, + col("t1.a") + lit(1i64).cast_to(&DataType::UInt32.into(), &t1_schema)?, + col("t2.a") + lit(2i32).cast_to(&DataType::UInt32.into(), &t2_schema)?, ) .alias("t1.a + 1 = t2.a + 2"); let plan = LogicalPlanBuilder::from(t1) diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index 4684dbd3b043..c2810718fd9a 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -807,6 +807,7 @@ mod tests { }; use crate::{OptimizerContext, OptimizerRule}; use arrow::datatypes::{DataType, Field, Schema}; + use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_common::{ Column, DFSchema, DFSchemaRef, JoinType, Result, TableReference, }; @@ -1544,15 +1545,15 @@ mod tests { vec![ ( Some("test".into()), - Arc::new(Field::new("a", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("a", DataType::UInt32, false)) ), ( Some("test".into()), - Arc::new(Field::new("b", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("b", DataType::UInt32, false)) ), ( Some("test2".into()), - Arc::new(Field::new("c1", DataType::UInt32, true)) + Arc::new(LogicalPhysicalField::new("c1", DataType::UInt32, true)) ), ], HashMap::new() @@ -1596,15 +1597,15 @@ mod tests { vec![ ( Some("test".into()), - Arc::new(Field::new("a", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("a", DataType::UInt32, false)) ), ( Some("test".into()), - Arc::new(Field::new("b", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("b", DataType::UInt32, false)) ), ( Some("test2".into()), - Arc::new(Field::new("c1", DataType::UInt32, true)) + Arc::new(LogicalPhysicalField::new("c1", DataType::UInt32, true)) ), ], HashMap::new() @@ -1646,15 +1647,15 @@ mod tests { vec![ ( Some("test".into()), - Arc::new(Field::new("a", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("a", DataType::UInt32, false)) ), ( Some("test".into()), - Arc::new(Field::new("b", DataType::UInt32, false)) + Arc::new(LogicalPhysicalField::new("b", DataType::UInt32, false)) ), ( Some("test2".into()), - 
Arc::new(Field::new("a", DataType::UInt32, true)) + Arc::new(LogicalPhysicalField::new("a", DataType::UInt32, true)) ), ], HashMap::new() diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index 14e5ac141eeb..b1b43884e9a3 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -468,7 +468,10 @@ pub(crate) fn assert_schema_is_the_same( prev_schema: &DFSchema, new_plan: &LogicalPlan, ) -> Result<()> { - let equivalent = new_plan.schema().equivalent_names_and_types(prev_schema); + // TODO(@notfilippo): this was changed from equivalent_names_and_types because of arrow_cast. Is it ok? + let equivalent = new_plan + .schema() + .logically_equivalent_names_and_types(prev_schema); if !equivalent { let e = DataFusionError::Internal(format!( @@ -540,17 +543,17 @@ mod tests { "Optimizer rule 'get table_scan rule' failed\n\ caused by\nget table_scan rule\ncaused by\n\ Internal error: Failed due to a difference in schemas, \ - original schema: DFSchema { inner: Schema { \ + original schema: DFSchema { inner: LogicalSchema { \ fields: [], \ metadata: {} }, \ field_qualifiers: [], \ functional_dependencies: FunctionalDependencies { deps: [] } \ }, \ - new schema: DFSchema { inner: Schema { \ + new schema: DFSchema { inner: LogicalSchema { \ fields: [\ - Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ - Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }\ + LogicalField { name: \"a\", data_type: UInt32, nullable: false, metadata: {} }, \ + LogicalField { name: \"b\", data_type: UInt32, nullable: false, metadata: {} }, \ + LogicalField { name: \"c\", data_type: UInt32, nullable: false, metadata: {} }\ ], \ metadata: {} }, \ field_qualifiers: [Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" }), Some(Bare { table: \"test\" })], \ diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index 88bd1b17883b..f5ec4d428e27 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -248,8 +248,9 @@ fn empty_child(plan: &LogicalPlan) -> Result> { mod tests { use std::sync::Arc; - use arrow::datatypes::{DataType, Field, Schema}; + use arrow::datatypes::{DataType, Field, Fields, Schema}; + use datafusion_common::logical_type::fields::LogicalPhysicalFields; use datafusion_common::{Column, DFSchema, JoinType, ScalarValue}; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ @@ -576,7 +577,7 @@ mod tests { fn test_empty_with_non_empty() -> Result<()> { let table_scan = test_table_scan()?; - let fields = test_table_scan_fields(); + let fields = LogicalPhysicalFields::from(Fields::from(test_table_scan_fields())); let empty = LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, diff --git a/datafusion/optimizer/src/push_down_filter.rs b/datafusion/optimizer/src/push_down_filter.rs index 1c3186b762b7..cfcd0ea33ef3 100644 --- a/datafusion/optimizer/src/push_down_filter.rs +++ b/datafusion/optimizer/src/push_down_filter.rs @@ -1197,9 +1197,10 @@ mod tests { use std::any::Any; use std::fmt::{Debug, Formatter}; - use arrow::datatypes::{DataType, Field, Schema, SchemaRef}; + use arrow::datatypes::{DataType, Field, Schema}; use 
async_trait::async_trait; - + use datafusion_common::logical_type::field::LogicalPhysicalField; + use datafusion_common::logical_type::schema::{LogicalPhysicalSchema, LogicalPhysicalSchemaRef}; use datafusion_common::ScalarValue; use datafusion_expr::expr::ScalarFunction; use datafusion_expr::logical_plan::table_scan; @@ -2405,10 +2406,10 @@ mod tests { #[async_trait] impl TableSource for PushDownProvider { - fn schema(&self) -> SchemaRef { - Arc::new(Schema::new(vec![ - Field::new("a", DataType::Int32, true), - Field::new("b", DataType::Int32, true), + fn schema(&self) -> LogicalPhysicalSchemaRef { + Arc::new(LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("a", DataType::Int32, true), + LogicalPhysicalField::new("b", DataType::Int32, true), ])) } @@ -2436,9 +2437,9 @@ mod tests { let table_scan = LogicalPlan::TableScan(TableScan { table_name: "test".into(), filters: vec![], - projected_schema: Arc::new(DFSchema::try_from( + projected_schema: Arc::new(DFSchema::try_from(LogicalPhysicalSchema::from( (*test_provider.schema()).clone(), - )?), + ))?), projection: None, source: Arc::new(test_provider), fetch: None, @@ -2508,9 +2509,9 @@ mod tests { let table_scan = LogicalPlan::TableScan(TableScan { table_name: "test".into(), filters: vec![col("a").eq(lit(10i64)), col("b").gt(lit(11i64))], - projected_schema: Arc::new(DFSchema::try_from( + projected_schema: Arc::new(DFSchema::try_from(LogicalPhysicalSchema::from( (*test_provider.schema()).clone(), - )?), + ))?), projection: Some(vec![0]), source: Arc::new(test_provider), fetch: None, @@ -2537,9 +2538,9 @@ mod tests { let table_scan = LogicalPlan::TableScan(TableScan { table_name: "test".into(), filters: vec![], - projected_schema: Arc::new(DFSchema::try_from( + projected_schema: Arc::new(DFSchema::try_from(LogicalPhysicalSchema::from( (*test_provider.schema()).clone(), - )?), + ))?), projection: Some(vec![0]), source: Arc::new(test_provider), fetch: None, diff --git a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs index 36dd85ac96e1..2ec6960ac498 100644 --- a/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs +++ b/datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs @@ -29,6 +29,7 @@ use arrow::{ use datafusion_common::{ cast::{as_large_list_array, as_list_array}, + logical_type::TypeRelation, tree_node::{Transformed, TransformedResult, TreeNode, TreeNodeRewriter}, }; use datafusion_common::{internal_err, DFSchema, DataFusionError, Result, ScalarValue}; @@ -943,7 +944,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: Multiply, right, }) if !info.nullable(&left)? - && !info.get_data_type(&left)?.is_floating() + && !info.get_data_type(&left)?.logical().is_floating() && is_zero(&right) => { Transformed::yes(*right) @@ -954,7 +955,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: Multiply, right, }) if !info.nullable(&right)? - && !info.get_data_type(&right)?.is_floating() + && !info.get_data_type(&right)?.logical().is_floating() && is_zero(&left) => { Transformed::yes(*left) @@ -1005,7 +1006,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { op: Modulo, right, }) if !info.nullable(&left)? 
- && !info.get_data_type(&left)?.is_floating() + && !info.get_data_type(&left)?.logical().is_floating() && is_one(&right) => { Transformed::yes(lit(0)) @@ -1050,7 +1051,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&left, &right) && !info.nullable(&right)? => { Transformed::yes(Expr::Literal(ScalarValue::new_zero( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1061,7 +1062,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&right, &left) && !info.nullable(&left)? => { Transformed::yes(Expr::Literal(ScalarValue::new_zero( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1136,7 +1137,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&left, &right) && !info.nullable(&right)? => { Transformed::yes(Expr::Literal(ScalarValue::new_negative_one( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1147,7 +1148,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&right, &left) && !info.nullable(&left)? => { Transformed::yes(Expr::Literal(ScalarValue::new_negative_one( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1222,7 +1223,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&left, &right) && !info.nullable(&right)? => { Transformed::yes(Expr::Literal(ScalarValue::new_negative_one( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1233,7 +1234,7 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { right, }) if is_negative_of(&right, &left) && !info.nullable(&left)? => { Transformed::yes(Expr::Literal(ScalarValue::new_negative_one( - &info.get_data_type(&left)?, + &info.get_data_type(&left)?.physical(), )?)) } @@ -1245,7 +1246,9 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) if expr_contains(&left, &right, BitwiseXor) => { let expr = delete_xor_in_complex_expr(&left, &right, false); Transformed::yes(if expr == *right { - Expr::Literal(ScalarValue::new_zero(&info.get_data_type(&right)?)?) + Expr::Literal(ScalarValue::new_zero( + &info.get_data_type(&right)?.physical(), + )?) } else { expr }) @@ -1259,7 +1262,9 @@ impl<'a, S: SimplifyInfo> TreeNodeRewriter for Simplifier<'a, S> { }) if expr_contains(&right, &left, BitwiseXor) => { let expr = delete_xor_in_complex_expr(&right, &left, true); Transformed::yes(if expr == *left { - Expr::Literal(ScalarValue::new_zero(&info.get_data_type(&left)?)?) + Expr::Literal(ScalarValue::new_zero( + &info.get_data_type(&left)?.physical(), + )?) 
} else { expr }) @@ -1769,7 +1774,13 @@ fn inlist_except(mut l1: InList, l2: InList) -> Result { #[cfg(test)] mod tests { - use datafusion_common::{assert_contains, DFSchemaRef, ToDFSchema}; + use crate::simplify_expressions::SimplifyContext; + use crate::test::test_table_scan_with_name; + use datafusion_common::logical_type::field::LogicalPhysicalField; + use datafusion_common::logical_type::schema::LogicalPhysicalSchema; + use datafusion_common::{ + assert_contains, logical_type::signature::LogicalType, DFSchemaRef, ToDFSchema, + }; use datafusion_expr::{ function::{ AccumulatorArgs, AggregateFunctionSimplification, @@ -1784,9 +1795,6 @@ mod tests { sync::Arc, }; - use crate::simplify_expressions::SimplifyContext; - use crate::test::test_table_scan_with_name; - use super::*; // ------------------------------ @@ -1822,9 +1830,9 @@ mod tests { } fn test_schema() -> DFSchemaRef { - Schema::new(vec![ - Field::new("i", DataType::Int64, false), - Field::new("b", DataType::Boolean, true), + LogicalPhysicalSchema::new(vec![ + LogicalPhysicalField::new("i", DataType::Int64, false), + LogicalPhysicalField::new("b", DataType::Boolean, true), ]) .to_dfschema_ref() .unwrap() @@ -3007,14 +3015,14 @@ mod tests { Arc::new( DFSchema::from_unqualified_fields( vec![ - Field::new("c1", DataType::Utf8, true), - Field::new("c2", DataType::Boolean, true), - Field::new("c3", DataType::Int64, true), - Field::new("c4", DataType::UInt32, true), - Field::new("c1_non_null", DataType::Utf8, false), - Field::new("c2_non_null", DataType::Boolean, false), - Field::new("c3_non_null", DataType::Int64, false), - Field::new("c4_non_null", DataType::UInt32, false), + LogicalPhysicalField::new("c1", DataType::Utf8, true), + LogicalPhysicalField::new("c2", DataType::Boolean, true), + LogicalPhysicalField::new("c3", DataType::Int64, true), + LogicalPhysicalField::new("c4", DataType::UInt32, true), + LogicalPhysicalField::new("c1_non_null", DataType::Utf8, false), + LogicalPhysicalField::new("c2_non_null", DataType::Boolean, false), + LogicalPhysicalField::new("c3_non_null", DataType::Int64, false), + LogicalPhysicalField::new("c4_non_null", DataType::UInt32, false), ] .into(), HashMap::new(), @@ -3102,7 +3110,10 @@ mod tests { #[test] fn simplify_expr_eq() { let schema = expr_test_schema(); - assert_eq!(col("c2").get_type(&schema).unwrap(), DataType::Boolean); + assert_eq!( + col("c2").get_type(&schema).unwrap().logical(), + &LogicalType::Boolean + ); // true = true -> true assert_eq!(simplify(lit(true).eq(lit(true))), lit(true)); @@ -3126,7 +3137,10 @@ mod tests { // expression to non-boolean. // // Make sure c1 column to be used in tests is not boolean type - assert_eq!(col("c1").get_type(&schema).unwrap(), DataType::Utf8); + assert_eq!( + col("c1").get_type(&schema).unwrap().logical(), + &LogicalType::Utf8 + ); // don't fold c1 = foo assert_eq!(simplify(col("c1").eq(lit("foo"))), col("c1").eq(lit("foo")),); @@ -3136,7 +3150,10 @@ mod tests { fn simplify_expr_not_eq() { let schema = expr_test_schema(); - assert_eq!(col("c2").get_type(&schema).unwrap(), DataType::Boolean); + assert_eq!( + col("c2").get_type(&schema).unwrap().logical(), + &LogicalType::Boolean + ); // c2 != true -> !c2 assert_eq!(simplify(col("c2").not_eq(lit(true))), col("c2").not(),); @@ -3157,7 +3174,10 @@ mod tests { // when one of the operand is not of boolean type, folding the // other boolean constant will change return type of // expression to non-boolean. 
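The test updates in this region show the user-visible half of the change: `Expr::get_type` now returns a `LogicalPhysicalType`, so assertions compare logical signatures rather than Arrow `DataType`s. A self-contained sketch of the new pattern, assuming the same constructors the tests above use:

use arrow::datatypes::DataType;
use datafusion_common::logical_type::field::LogicalPhysicalField;
use datafusion_common::logical_type::schema::LogicalPhysicalSchema;
use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::TypeRelation;
use datafusion_common::ToDFSchema;
use datafusion_expr::{col, ExprSchemable};

fn main() {
    // Fields are still declared from plain Arrow types.
    let schema = LogicalPhysicalSchema::new(vec![LogicalPhysicalField::new(
        "b",
        DataType::Boolean,
        true,
    )])
    .to_dfschema_ref()
    .unwrap();

    // Was: assert_eq!(col("b").get_type(&schema).unwrap(), DataType::Boolean);
    assert_eq!(
        col("b").get_type(&schema).unwrap().logical(),
        &LogicalType::Boolean
    );
}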
- assert_eq!(col("c1").get_type(&schema).unwrap(), DataType::Utf8); + assert_eq!( + col("c1").get_type(&schema).unwrap().logical(), + &LogicalType::Utf8 + ); assert_eq!( simplify(col("c1").not_eq(lit("foo"))), diff --git a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs index e650d4c09c23..a083945b6a3e 100644 --- a/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs +++ b/datafusion/optimizer/src/simplify_expressions/simplify_exprs.rs @@ -93,7 +93,7 @@ impl SimplifyExpressions { // projection applied for simplification Arc::new(DFSchema::try_from_qualified_schema( scan.table_name.clone(), - &scan.source.schema(), + &scan.source.schema().as_ref().clone().into(), )?) } else { Arc::new(DFSchema::empty()) @@ -149,10 +149,9 @@ impl SimplifyExpressions { mod tests { use std::ops::Not; + use crate::optimizer::Optimizer; use arrow::datatypes::{DataType, Field, Schema}; use chrono::{DateTime, Utc}; - - use crate::optimizer::Optimizer; use datafusion_expr::logical_plan::builder::table_scan_with_filters; use datafusion_expr::logical_plan::table_scan; use datafusion_expr::{ @@ -703,9 +702,10 @@ mod tests { let t1 = test_table_scan_with_name("t1")?; let t2 = test_table_scan_with_name("t2")?; - let left_key = col("t1.a") + lit(1i64).cast_to(&DataType::UInt32, t1.schema())?; + let left_key = + col("t1.a") + lit(1i64).cast_to(&DataType::UInt32.into(), t1.schema())?; let right_key = - col("t2.a") + lit(2i64).cast_to(&DataType::UInt32, t2.schema())?; + col("t2.a") + lit(2i64).cast_to(&DataType::UInt32.into(), t2.schema())?; let plan = LogicalPlanBuilder::from(t1) .join_with_expr_keys( t2, diff --git a/datafusion/optimizer/src/test/mod.rs b/datafusion/optimizer/src/test/mod.rs index 2c7e8644026e..dbabdb5c926f 100644 --- a/datafusion/optimizer/src/test/mod.rs +++ b/datafusion/optimizer/src/test/mod.rs @@ -59,7 +59,7 @@ pub fn assert_fields_eq(plan: &LogicalPlan, expected: Vec<&str>) { .schema() .fields() .iter() - .map(|f| f.name().clone()) + .map(|f| f.name().to_string()) .collect(); assert_eq!(actual, expected); } diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index de471d59c466..f33ef5767afb 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -29,6 +29,8 @@ use arrow::datatypes::{ DataType, TimeUnit, MAX_DECIMAL_FOR_EACH_PRECISION, MIN_DECIMAL_FOR_EACH_PRECISION, }; use arrow::temporal_conversions::{MICROSECONDS, MILLISECONDS, NANOSECONDS}; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::tree_node::{Transformed, TreeNode, TreeNodeRewriter}; use datafusion_common::{internal_err, DFSchema, DFSchemaRef, Result, ScalarValue}; use datafusion_expr::expr::{BinaryExpr, Cast, InList, TryCast}; @@ -104,7 +106,7 @@ impl OptimizerRule for UnwrapCastInComparison { if let LogicalPlan::TableScan(ts) = &plan { let source_schema = DFSchema::try_from_qualified_schema( ts.table_name.clone(), - &ts.source.schema(), + &ts.source.schema().as_ref().clone().into(), )?; schema.merge(&source_schema); } @@ -275,94 +277,86 @@ fn is_comparison_op(op: &Operator) -> bool { } /// Returns true if [UnwrapCastExprRewriter] supports this data type -fn is_supported_type(data_type: &DataType) -> bool { - is_supported_numeric_type(data_type) - || is_supported_string_type(data_type) - || 
is_supported_dictionary_type(data_type)
+fn is_supported_type(data_type: &LogicalPhysicalType) -> bool {
+    is_supported_numeric_type(data_type) || is_supported_string_type(data_type)
 }
 
 /// Returns true if [UnwrapCastExprRewriter] supports this numeric type
-fn is_supported_numeric_type(data_type: &DataType) -> bool {
+fn is_supported_numeric_type(data_type: &LogicalPhysicalType) -> bool {
+    use LogicalType::*;
     matches!(
-        data_type,
-        DataType::UInt8
-            | DataType::UInt16
-            | DataType::UInt32
-            | DataType::UInt64
-            | DataType::Int8
-            | DataType::Int16
-            | DataType::Int32
-            | DataType::Int64
-            | DataType::Decimal128(_, _)
-            | DataType::Timestamp(_, _)
+        data_type.logical(),
+        UInt8
+            | UInt16
+            | UInt32
+            | UInt64
+            | Int8
+            | Int16
+            | Int32
+            | Int64
+            | Decimal128(_, _)
+            | Timestamp(_, _)
     )
 }
 
 /// Returns true if [UnwrapCastExprRewriter] supports casting this value as a string
-fn is_supported_string_type(data_type: &DataType) -> bool {
-    matches!(data_type, DataType::Utf8 | DataType::LargeUtf8)
-}
-
-/// Returns true if [UnwrapCastExprRewriter] supports casting this value as a dictionary
-fn is_supported_dictionary_type(data_type: &DataType) -> bool {
-    matches!(data_type,
-        DataType::Dictionary(_, inner) if is_supported_type(inner))
+fn is_supported_string_type(data_type: &LogicalPhysicalType) -> bool {
+    matches!(data_type.logical(), LogicalType::Utf8)
 }
 
 /// Convert a literal value from one data type to another
 fn try_cast_literal_to_type(
     lit_value: &ScalarValue,
-    target_type: &DataType,
+    target_type: &LogicalPhysicalType,
 ) -> Option<ScalarValue> {
-    let lit_data_type = lit_value.data_type();
+    let lit_data_type = lit_value.data_type().into();
     if !is_supported_type(&lit_data_type) || !is_supported_type(target_type) {
         return None;
     }
     if lit_value.is_null() {
         // null value can be cast to any type of null value
-        return ScalarValue::try_from(target_type).ok();
+        return ScalarValue::try_from(target_type.physical().clone()).ok();
     }
     try_cast_numeric_literal(lit_value, target_type)
         .or_else(|| try_cast_string_literal(lit_value, target_type))
-        .or_else(|| try_cast_dictionary(lit_value, target_type))
 }
 
 /// Convert a numeric value from one numeric data type to another
 fn try_cast_numeric_literal(
     lit_value: &ScalarValue,
-    target_type: &DataType,
+    target_type: &LogicalPhysicalType,
 ) -> Option<ScalarValue> {
-    let lit_data_type = lit_value.data_type();
+    let lit_data_type = lit_value.data_type().into();
     if !is_supported_numeric_type(&lit_data_type)
         || !is_supported_numeric_type(target_type)
     {
         return None;
     }
 
-    let mul = match target_type {
-        DataType::UInt8
-        | DataType::UInt16
-        | DataType::UInt32
-        | DataType::UInt64
-        | DataType::Int8
-        | DataType::Int16
-        | DataType::Int32
-        | DataType::Int64 => 1_i128,
-        DataType::Timestamp(_, _) => 1_i128,
-        DataType::Decimal128(_, scale) => 10_i128.pow(*scale as u32),
+    let mul = match target_type.logical() {
+        LogicalType::UInt8
+        | LogicalType::UInt16
+        | LogicalType::UInt32
+        | LogicalType::UInt64
+        | LogicalType::Int8
+        | LogicalType::Int16
+        | LogicalType::Int32
+        | LogicalType::Int64 => 1_i128,
+        LogicalType::Timestamp(_, _) => 1_i128,
+        LogicalType::Decimal128(_, scale) => 10_i128.pow(*scale as u32),
         _ => return None,
     };
 
-    let (target_min, target_max) = match target_type {
-        DataType::UInt8 => (u8::MIN as i128, u8::MAX as i128),
-        DataType::UInt16 => (u16::MIN as i128, u16::MAX as i128),
-        DataType::UInt32 => (u32::MIN as i128, u32::MAX as i128),
-        DataType::UInt64 => (u64::MIN as i128, u64::MAX as i128),
-        DataType::Int8 => (i8::MIN as i128, i8::MAX as i128),
-        DataType::Int16 => (i16::MIN as i128, i16::MAX as i128),
-        DataType::Int32 => (i32::MIN as i128, i32::MAX as i128),
-        DataType::Int64 => (i64::MIN as i128, i64::MAX as i128),
-        DataType::Timestamp(_, _) => (i64::MIN as i128, i64::MAX as i128),
-        DataType::Decimal128(precision, _) => (
+    let (target_min, target_max) = match target_type.logical() {
+        LogicalType::UInt8 => (u8::MIN as i128, u8::MAX as i128),
+        LogicalType::UInt16 => (u16::MIN as i128, u16::MAX as i128),
+        LogicalType::UInt32 => (u32::MIN as i128, u32::MAX as i128),
+        LogicalType::UInt64 => (u64::MIN as i128, u64::MAX as i128),
+        LogicalType::Int8 => (i8::MIN as i128, i8::MAX as i128),
+        LogicalType::Int16 => (i16::MIN as i128, i16::MAX as i128),
+        LogicalType::Int32 => (i32::MIN as i128, i32::MAX as i128),
+        LogicalType::Int64 => (i64::MIN as i128, i64::MAX as i128),
+        LogicalType::Timestamp(_, _) => (i64::MIN as i128, i64::MAX as i128),
+        LogicalType::Decimal128(precision, _) => (
             // Different precision for decimal128 can store different range of value.
             // For example, the precision is 3, the max of value is `999` and the min
             // value is `-999`
@@ -412,16 +406,16 @@ fn try_cast_numeric_literal(
     if value >= target_min && value <= target_max {
         // the value cast from lit to the target type is in the range of the target type
         // return the target type of scalar value
-        let result_scalar = match target_type {
-            DataType::Int8 => ScalarValue::Int8(Some(value as i8)),
-            DataType::Int16 => ScalarValue::Int16(Some(value as i16)),
-            DataType::Int32 => ScalarValue::Int32(Some(value as i32)),
-            DataType::Int64 => ScalarValue::Int64(Some(value as i64)),
-            DataType::UInt8 => ScalarValue::UInt8(Some(value as u8)),
-            DataType::UInt16 => ScalarValue::UInt16(Some(value as u16)),
-            DataType::UInt32 => ScalarValue::UInt32(Some(value as u32)),
-            DataType::UInt64 => ScalarValue::UInt64(Some(value as u64)),
-            DataType::Timestamp(TimeUnit::Second, tz) => {
+        let result_scalar = match target_type.logical() {
+            LogicalType::Int8 => ScalarValue::Int8(Some(value as i8)),
+            LogicalType::Int16 => ScalarValue::Int16(Some(value as i16)),
+            LogicalType::Int32 => ScalarValue::Int32(Some(value as i32)),
+            LogicalType::Int64 => ScalarValue::Int64(Some(value as i64)),
+            LogicalType::UInt8 => ScalarValue::UInt8(Some(value as u8)),
+            LogicalType::UInt16 => ScalarValue::UInt16(Some(value as u16)),
+            LogicalType::UInt32 => ScalarValue::UInt32(Some(value as u32)),
+            LogicalType::UInt64 => ScalarValue::UInt64(Some(value as u64)),
+            LogicalType::Timestamp(TimeUnit::Second, tz) => {
                 let value = cast_between_timestamp(
                     lit_data_type,
                     DataType::Timestamp(TimeUnit::Second, tz.clone()),
@@ -429,7 +423,7 @@
                 );
                 ScalarValue::TimestampSecond(value, tz.clone())
             }
-            DataType::Timestamp(TimeUnit::Millisecond, tz) => {
+            LogicalType::Timestamp(TimeUnit::Millisecond, tz) => {
                 let value = cast_between_timestamp(
                     lit_data_type,
                     DataType::Timestamp(TimeUnit::Millisecond, tz.clone()),
@@ -437,7 +431,7 @@
                 );
                 ScalarValue::TimestampMillisecond(value, tz.clone())
             }
-            DataType::Timestamp(TimeUnit::Microsecond, tz) => {
+            LogicalType::Timestamp(TimeUnit::Microsecond, tz) => {
                 let value = cast_between_timestamp(
                     lit_data_type,
                     DataType::Timestamp(TimeUnit::Microsecond, tz.clone()),
@@ -445,7 +439,7 @@
                 );
                 ScalarValue::TimestampMicrosecond(value, tz.clone())
             }
-            DataType::Timestamp(TimeUnit::Nanosecond, tz) => {
+            LogicalType::Timestamp(TimeUnit::Nanosecond, tz) => {
                 let value = cast_between_timestamp(
lit_data_type, DataType::Timestamp(TimeUnit::Nanosecond, tz.clone()), @@ -453,7 +447,7 @@ fn try_cast_numeric_literal( ); ScalarValue::TimestampNanosecond(value, tz.clone()) } - DataType::Decimal128(p, s) => { + LogicalType::Decimal128(p, s) => { ScalarValue::Decimal128(Some(value), *p, *s) } _ => { @@ -470,62 +464,39 @@ fn try_cast_numeric_literal( fn try_cast_string_literal( lit_value: &ScalarValue, - target_type: &DataType, + target_type: &LogicalPhysicalType, ) -> Option { let string_value = match lit_value { ScalarValue::Utf8(s) | ScalarValue::LargeUtf8(s) => s.clone(), _ => return None, }; - let scalar_value = match target_type { - DataType::Utf8 => ScalarValue::Utf8(string_value), - DataType::LargeUtf8 => ScalarValue::LargeUtf8(string_value), + let scalar_value = match target_type.logical() { + LogicalType::Utf8 => ScalarValue::Utf8(string_value), _ => return None, }; Some(scalar_value) } -/// Attempt to cast to/from a dictionary type by wrapping/unwrapping the dictionary -fn try_cast_dictionary( - lit_value: &ScalarValue, - target_type: &DataType, -) -> Option { - let lit_value_type = lit_value.data_type(); - let result_scalar = match (lit_value, target_type) { - // Unwrap dictionary when inner type matches target type - (ScalarValue::Dictionary(_, inner_value), _) - if inner_value.data_type() == *target_type => - { - (**inner_value).clone() - } - // Wrap type when target type is dictionary - (_, DataType::Dictionary(index_type, inner_type)) - if **inner_type == lit_value_type => - { - ScalarValue::Dictionary(index_type.clone(), Box::new(lit_value.clone())) - } - _ => { - return None; - } - }; - Some(result_scalar) -} - /// Cast a timestamp value from one unit to another -fn cast_between_timestamp(from: DataType, to: DataType, value: i128) -> Option { +fn cast_between_timestamp( + from: impl Into, + to: impl Into, + value: i128, +) -> Option { let value = value as i64; - let from_scale = match from { - DataType::Timestamp(TimeUnit::Second, _) => 1, - DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS, - DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS, - DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS, + let from_scale = match from.into().logical() { + LogicalType::Timestamp(TimeUnit::Second, _) => 1, + LogicalType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS, + LogicalType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS, + LogicalType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS, _ => return Some(value), }; - let to_scale = match to { - DataType::Timestamp(TimeUnit::Second, _) => 1, - DataType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS, - DataType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS, - DataType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS, + let to_scale = match to.into().logical() { + LogicalType::Timestamp(TimeUnit::Second, _) => 1, + LogicalType::Timestamp(TimeUnit::Millisecond, _) => MILLISECONDS, + LogicalType::Timestamp(TimeUnit::Microsecond, _) => MICROSECONDS, + LogicalType::Timestamp(TimeUnit::Nanosecond, _) => NANOSECONDS, _ => return Some(value), }; @@ -542,8 +513,11 @@ mod tests { use super::*; - use arrow::compute::{cast_with_options, CastOptions}; - use arrow::datatypes::Field; + use arrow::{ + compute::{cast_with_options, CastOptions}, + datatypes::DataType, + }; + use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_common::tree_node::TransformedResult; use datafusion_expr::{cast, col, in_list, try_cast}; @@ -600,45 +574,6 @@ mod tests { 
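For orientation, `cast_between_timestamp` reduces to rescaling by each unit's ticks per second (1, MILLISECONDS, MICROSECONDS, NANOSECONDS), with multiplication overflow surfacing as None; that is why the i64::MAX-seconds case in the tests further below comes back as TimestampMillisecond(None, None). A rough reconstruction of the arithmetic, since the patch elides the exact body here:

// from_scale / to_scale are ticks per second: 1 for seconds, 1_000 for
// milliseconds, and so on.
fn rescale(value: i64, from_scale: i64, to_scale: i64) -> Option<i64> {
    if from_scale <= to_scale {
        // Widening, e.g. 123 s -> 123_000 ms; overflow yields None.
        value.checked_mul(to_scale / from_scale)
    } else {
        // Narrowing truncates, e.g. 123_456_789 us -> 123 s.
        Some(value / (from_scale / to_scale))
    }
}

fn main() {
    assert_eq!(rescale(123, 1, 1_000), Some(123_000));
    assert_eq!(rescale(123_456_789, 1_000_000, 1), Some(123));
    assert_eq!(rescale(i64::MAX, 1, 1_000), None); // the overflow test case
}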
assert_eq!(optimize_test(expr_input, &schema), expected); } - #[test] - fn test_unwrap_cast_comparison_string() { - let schema = expr_test_schema(); - let dict = ScalarValue::Dictionary( - Box::new(DataType::Int32), - Box::new(ScalarValue::from("value")), - ); - - // cast(str1 as Dictionary) = arrow_cast('value', 'Dictionary') => str1 = Utf8('value1') - let expr_input = cast(col("str1"), dict.data_type()).eq(lit(dict.clone())); - let expected = col("str1").eq(lit("value")); - assert_eq!(optimize_test(expr_input, &schema), expected); - - // cast(tag as Utf8) = Utf8('value') => tag = arrow_cast('value', 'Dictionary') - let expr_input = cast(col("tag"), DataType::Utf8).eq(lit("value")); - let expected = col("tag").eq(lit(dict.clone())); - assert_eq!(optimize_test(expr_input, &schema), expected); - - // Verify reversed argument order - // arrow_cast('value', 'Dictionary') = cast(str1 as Dictionary) => Utf8('value1') = str1 - let expr_input = lit(dict.clone()).eq(cast(col("str1"), dict.data_type())); - let expected = lit("value").eq(col("str1")); - assert_eq!(optimize_test(expr_input, &schema), expected); - } - - #[test] - fn test_unwrap_cast_comparison_large_string() { - let schema = expr_test_schema(); - // cast(largestr as Dictionary) = arrow_cast('value', 'Dictionary') => str1 = LargeUtf8('value1') - let dict = ScalarValue::Dictionary( - Box::new(DataType::Int32), - Box::new(ScalarValue::LargeUtf8(Some("value".to_owned()))), - ); - let expr_input = cast(col("largestr"), dict.data_type()).eq(lit(dict.clone())); - let expected = - col("largestr").eq(lit(ScalarValue::LargeUtf8(Some("value".to_owned())))); - assert_eq!(optimize_test(expr_input, &schema), expected); - } - #[test] fn test_not_unwrap_cast_with_decimal_comparison() { let schema = expr_test_schema(); @@ -841,17 +776,16 @@ mod tests { Arc::new( DFSchema::from_unqualified_fields( vec![ - Field::new("c1", DataType::Int32, false), - Field::new("c2", DataType::Int64, false), - Field::new("c3", DataType::Decimal128(18, 2), false), - Field::new("c4", DataType::Decimal128(38, 37), false), - Field::new("c5", DataType::Float32, false), - Field::new("c6", DataType::UInt32, false), - Field::new("ts_nano_none", timestamp_nano_none_type(), false), - Field::new("ts_nano_utf", timestamp_nano_utc_type(), false), - Field::new("str1", DataType::Utf8, false), - Field::new("largestr", DataType::LargeUtf8, false), - Field::new("tag", dictionary_tag_type(), false), + LogicalPhysicalField::new("c1", DataType::Int32, false), + LogicalPhysicalField::new("c2", DataType::Int64, false), + LogicalPhysicalField::new("c3", DataType::Decimal128(18, 2), false), + LogicalPhysicalField::new("c4", DataType::Decimal128(38, 37), false), + LogicalPhysicalField::new("c5", DataType::Float32, false), + LogicalPhysicalField::new("c6", DataType::UInt32, false), + LogicalPhysicalField::new("ts_nano_none", timestamp_nano_none_type(), false), + LogicalPhysicalField::new("ts_nano_utf", timestamp_nano_utc_type(), false), + LogicalPhysicalField::new("str1", DataType::Utf8, false), + LogicalPhysicalField::new("largestr", DataType::LargeUtf8, false), ] .into(), HashMap::new(), @@ -889,19 +823,14 @@ mod tests { lit(ScalarValue::Decimal128(None, precision, scale)) } - fn timestamp_nano_none_type() -> DataType { - DataType::Timestamp(TimeUnit::Nanosecond, None) + fn timestamp_nano_none_type() -> LogicalPhysicalType { + DataType::Timestamp(TimeUnit::Nanosecond, None).into() } // this is the type that now() returns - fn timestamp_nano_utc_type() -> DataType { + fn timestamp_nano_utc_type() 
-> LogicalPhysicalType { let utc = Some("+0:00".into()); - DataType::Timestamp(TimeUnit::Nanosecond, utc) - } - - // a dictonary type for storing string tags - fn dictionary_tag_type() -> DataType { - DataType::Dictionary(Box::new(DataType::Int32), Box::new(DataType::Utf8)) + DataType::Timestamp(TimeUnit::Nanosecond, utc).into() } #[test] @@ -1084,11 +1013,11 @@ mod tests { // so double check it here assert_eq!(lit_tz_none, lit_tz_utc); - // e.g. DataType::Timestamp(_, None) - let dt_tz_none = lit_tz_none.data_type(); + // e.g. LogicalType::Timestamp(_, None) + let dt_tz_none: LogicalPhysicalType = lit_tz_none.data_type().into(); - // e.g. DataType::Timestamp(_, Some(utc)) - let dt_tz_utc = lit_tz_utc.data_type(); + // e.g. LogicalType::Timestamp(_, Some(utc)) + let dt_tz_utc: LogicalPhysicalType = lit_tz_utc.data_type().into(); // None <--> None expect_cast( @@ -1153,7 +1082,7 @@ mod tests { // int64 to list expect_cast( ScalarValue::Int64(Some(12345)), - DataType::List(Arc::new(Field::new("f", DataType::Int32, true))), + DataType::new_list(DataType::Int32, true), ExpectedCast::NoValue, ); } @@ -1171,9 +1100,10 @@ mod tests { /// casting is consistent with the Arrow kernels fn expect_cast( literal: ScalarValue, - target_type: DataType, + target_type: impl Into, expected_result: ExpectedCast, ) { + let target_type = target_type.into(); let actual_value = try_cast_literal_to_type(&literal, &target_type); println!("expect_cast: "); @@ -1199,7 +1129,7 @@ mod tests { .expect("Failed to convert to array of size"); let cast_array = cast_with_options( &literal_array, - &target_type, + target_type.physical(), &CastOptions::default(), ) .expect("Expected to be cast array with arrow cast kernel"); @@ -1214,8 +1144,10 @@ mod tests { if let ( DataType::Timestamp(left_unit, left_tz), DataType::Timestamp(right_unit, right_tz), - ) = (actual_value.data_type(), expected_value.data_type()) - { + ) = ( + actual_value.data_type().into(), + expected_value.data_type().into(), + ) { assert_eq!(left_unit, right_unit); assert_eq!(left_tz, right_tz); } @@ -1234,7 +1166,7 @@ mod tests { // same timestamp let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampNanosecond(Some(123456), None), - &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DataType::Timestamp(TimeUnit::Nanosecond, None).into(), ) .unwrap(); @@ -1246,7 +1178,7 @@ mod tests { // TimestampNanosecond to TimestampMicrosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampNanosecond(Some(123456), None), - &DataType::Timestamp(TimeUnit::Microsecond, None), + &DataType::Timestamp(TimeUnit::Microsecond, None).into(), ) .unwrap(); @@ -1258,7 +1190,7 @@ mod tests { // TimestampNanosecond to TimestampMillisecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampNanosecond(Some(123456), None), - &DataType::Timestamp(TimeUnit::Millisecond, None), + &DataType::Timestamp(TimeUnit::Millisecond, None).into(), ) .unwrap(); @@ -1267,7 +1199,7 @@ mod tests { // TimestampNanosecond to TimestampSecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampNanosecond(Some(123456), None), - &DataType::Timestamp(TimeUnit::Second, None), + &DataType::Timestamp(TimeUnit::Second, None).into(), ) .unwrap(); @@ -1276,7 +1208,7 @@ mod tests { // TimestampMicrosecond to TimestampNanosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMicrosecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DataType::Timestamp(TimeUnit::Nanosecond, None).into(), ) .unwrap(); @@ -1288,7 
+1220,7 @@ mod tests { // TimestampMicrosecond to TimestampMillisecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMicrosecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Millisecond, None), + &DataType::Timestamp(TimeUnit::Millisecond, None).into(), ) .unwrap(); @@ -1297,7 +1229,7 @@ mod tests { // TimestampMicrosecond to TimestampSecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMicrosecond(Some(123456789), None), - &DataType::Timestamp(TimeUnit::Second, None), + &DataType::Timestamp(TimeUnit::Second, None).into(), ) .unwrap(); assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123), None)); @@ -1305,7 +1237,7 @@ mod tests { // TimestampMillisecond to TimestampNanosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMillisecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DataType::Timestamp(TimeUnit::Nanosecond, None).into(), ) .unwrap(); assert_eq!( @@ -1316,7 +1248,7 @@ mod tests { // TimestampMillisecond to TimestampMicrosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMillisecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Microsecond, None), + &DataType::Timestamp(TimeUnit::Microsecond, None).into(), ) .unwrap(); assert_eq!( @@ -1326,7 +1258,7 @@ mod tests { // TimestampMillisecond to TimestampSecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampMillisecond(Some(123456789), None), - &DataType::Timestamp(TimeUnit::Second, None), + &DataType::Timestamp(TimeUnit::Second, None).into(), ) .unwrap(); assert_eq!(new_scalar, ScalarValue::TimestampSecond(Some(123456), None)); @@ -1334,7 +1266,7 @@ mod tests { // TimestampSecond to TimestampNanosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampSecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Nanosecond, None), + &DataType::Timestamp(TimeUnit::Nanosecond, None).into(), ) .unwrap(); assert_eq!( @@ -1345,7 +1277,7 @@ mod tests { // TimestampSecond to TimestampMicrosecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampSecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Microsecond, None), + &DataType::Timestamp(TimeUnit::Microsecond, None).into(), ) .unwrap(); assert_eq!( @@ -1356,7 +1288,7 @@ mod tests { // TimestampSecond to TimestampMillisecond let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampSecond(Some(123), None), - &DataType::Timestamp(TimeUnit::Millisecond, None), + &DataType::Timestamp(TimeUnit::Millisecond, None).into(), ) .unwrap(); assert_eq!( @@ -1367,7 +1299,7 @@ mod tests { // overflow let new_scalar = try_cast_literal_to_type( &ScalarValue::TimestampSecond(Some(i64::MAX), None), - &DataType::Timestamp(TimeUnit::Millisecond, None), + &DataType::Timestamp(TimeUnit::Millisecond, None).into(), ) .unwrap(); assert_eq!(new_scalar, ScalarValue::TimestampMillisecond(None, None)); @@ -1388,29 +1320,4 @@ mod tests { } } } - #[test] - fn test_try_cast_to_dictionary_type() { - fn dictionary_type(t: DataType) -> DataType { - DataType::Dictionary(Box::new(DataType::Int32), Box::new(t)) - } - fn dictionary_value(value: ScalarValue) -> ScalarValue { - ScalarValue::Dictionary(Box::new(DataType::Int32), Box::new(value)) - } - let scalars = vec![ - ScalarValue::from("string"), - ScalarValue::LargeUtf8(Some("string".to_owned())), - ]; - for s in &scalars { - expect_cast( - s.clone(), - dictionary_type(s.data_type()), - ExpectedCast::Value(dictionary_value(s.clone())), - ); - expect_cast( - dictionary_value(s.clone()), - 
s.data_type(), - ExpectedCast::Value(s.clone()), - ) - } - } } diff --git a/datafusion/optimizer/tests/optimizer_integration.rs b/datafusion/optimizer/tests/optimizer_integration.rs index c0863839dba1..ae2b15165149 100644 --- a/datafusion/optimizer/tests/optimizer_integration.rs +++ b/datafusion/optimizer/tests/optimizer_integration.rs @@ -22,6 +22,8 @@ use std::sync::Arc; use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit}; use datafusion_common::config::ConfigOptions; +use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{plan_err, Result}; use datafusion_expr::test::function_stub::sum_udaf; use datafusion_expr::{AggregateUDF, LogicalPlan, ScalarUDF, TableSource, WindowUDF}; @@ -414,7 +416,7 @@ impl ContextProvider for MyContextProvider { self.udafs.get(name).cloned() } - fn get_variable_type(&self, _variable_names: &[String]) -> Option { + fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } @@ -448,7 +450,7 @@ impl TableSource for MyTableSource { self } - fn schema(&self) -> SchemaRef { - self.schema.clone() + fn schema(&self) -> LogicalPhysicalSchemaRef { + LogicalPhysicalSchemaRef::new(self.schema.clone().into()) } } diff --git a/datafusion/physical-expr-common/src/utils.rs b/datafusion/physical-expr-common/src/utils.rs index 44622bd309df..8da8dc931157 100644 --- a/datafusion/physical-expr-common/src/utils.rs +++ b/datafusion/physical-expr-common/src/utils.rs @@ -21,6 +21,7 @@ use arrow::array::{make_array, Array, ArrayRef, BooleanArray, MutableArrayData}; use arrow::compute::{and_kleene, is_not_null, SlicesIterator}; use arrow::datatypes::Schema; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{exec_err, Result}; use datafusion_expr::expr::Alias; use datafusion_expr::sort_properties::ExprProperties; @@ -124,7 +125,8 @@ pub fn limited_convert_logical_expr_to_physical_expr( cast_expr.expr.as_ref(), schema, )?, - cast_expr.data_type.clone(), + // TODO(@notfilippo): do not convert to physical type + cast_expr.data_type.physical().clone(), None, ))), Expr::Literal(value) => Ok(Arc::new(Literal::new(value.clone()))), diff --git a/datafusion/physical-expr/src/expressions/in_list.rs b/datafusion/physical-expr/src/expressions/in_list.rs index 8a3885030b9d..37017bfe26f1 100644 --- a/datafusion/physical-expr/src/expressions/in_list.rs +++ b/datafusion/physical-expr/src/expressions/in_list.rs @@ -429,7 +429,10 @@ pub fn in_list( let expr_data_type = expr.data_type(schema)?; for list_expr in list.iter() { let list_expr_data_type = list_expr.data_type(schema)?; - if !DFSchema::datatype_is_logically_equal(&expr_data_type, &list_expr_data_type) { + if !DFSchema::datatype_is_logically_equal( + &expr_data_type.clone().into(), + &list_expr_data_type.clone().into(), + ) { return internal_err!( "The data type inlist should be same, the value type is {expr_data_type}, one of list expr type is {list_expr_data_type}" ); diff --git a/datafusion/physical-expr/src/expressions/negative.rs b/datafusion/physical-expr/src/expressions/negative.rs index b5ebc250cb89..71b5ca16f3b9 100644 --- a/datafusion/physical-expr/src/expressions/negative.rs +++ b/datafusion/physical-expr/src/expressions/negative.rs @@ -160,7 +160,7 @@ pub fn negative( arg: Arc, input_schema: &Schema, ) -> Result> { - let data_type = arg.data_type(input_schema)?; + let data_type = arg.data_type(input_schema)?.into(); if is_null(&data_type) { Ok(arg) } else if 
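`TableSource::schema` now returns a `LogicalPhysicalSchemaRef` (see `MyTableSource` just above), so sources that keep a plain Arrow `SchemaRef` convert at the trait boundary. A minimal sketch using only the conversions these hunks rely on:

use std::any::Any;
use std::sync::Arc;
use arrow::datatypes::{DataType, Field, Schema, SchemaRef};
use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
use datafusion_expr::TableSource;

struct ArrowBackedSource {
    schema: SchemaRef,
}

impl TableSource for ArrowBackedSource {
    fn as_any(&self) -> &dyn Any {
        self
    }
    // Clone the Arrow schema out of the Arc and lift it into the logical schema.
    fn schema(&self) -> LogicalPhysicalSchemaRef {
        LogicalPhysicalSchemaRef::new(self.schema.as_ref().clone().into())
    }
}

fn main() {
    let source = ArrowBackedSource {
        schema: Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, true)])),
    };
    let _logical: LogicalPhysicalSchemaRef = source.schema();
}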
!is_signed_numeric(&data_type) diff --git a/datafusion/physical-expr/src/planner.rs b/datafusion/physical-expr/src/planner.rs index a975f0c6ef83..713325a31306 100644 --- a/datafusion/physical-expr/src/planner.rs +++ b/datafusion/physical-expr/src/planner.rs @@ -24,6 +24,8 @@ use crate::{ }; use arrow::datatypes::Schema; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{ exec_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, ToDFSchema, }; @@ -259,12 +261,12 @@ pub fn create_physical_expr( Expr::Cast(Cast { expr, data_type }) => expressions::cast( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, - data_type.clone(), + data_type.physical().clone(), ), Expr::TryCast(TryCast { expr, data_type }) => expressions::try_cast( create_physical_expr(expr, input_dfschema, execution_props)?, input_schema, - data_type.clone(), + data_type.physical().clone(), ), Expr::Not(expr) => { expressions::not(create_physical_expr(expr, input_dfschema, execution_props)?) @@ -369,7 +371,7 @@ where /// Convert a logical expression to a physical expression (without any simplification, etc) pub fn logical2physical(expr: &Expr, schema: &Schema) -> Arc { - let df_schema = schema.clone().to_dfschema().unwrap(); + let df_schema = LogicalPhysicalSchema::from(schema.clone()).to_dfschema().unwrap(); let execution_props = ExecutionProps::new(); create_physical_expr(expr, &df_schema, &execution_props).unwrap() } @@ -388,7 +390,8 @@ mod tests { let expr = col("letter").eq(lit("A")); let schema = Schema::new(vec![Field::new("letter", DataType::Utf8, false)]); - let df_schema = DFSchema::try_from_qualified_schema("data", &schema)?; + let df_schema = + DFSchema::try_from_qualified_schema("data", &schema.clone().into())?; let p = create_physical_expr(&expr, &df_schema, &ExecutionProps::new())?; let batch = RecordBatch::try_new( diff --git a/datafusion/physical-expr/src/utils/guarantee.rs b/datafusion/physical-expr/src/utils/guarantee.rs index 42e5e6fcf3ac..2288cf506a95 100644 --- a/datafusion/physical-expr/src/utils/guarantee.rs +++ b/datafusion/physical-expr/src/utils/guarantee.rs @@ -419,12 +419,11 @@ impl<'a> ColOpLit<'a> { #[cfg(test)] mod test { - use std::sync::OnceLock; - use super::*; use crate::planner::logical2physical; - use arrow_schema::{DataType, Field, Schema, SchemaRef}; + use std::sync::OnceLock; + use datafusion_expr::expr_fn::*; use datafusion_expr::{lit, Expr}; @@ -835,7 +834,7 @@ mod test { fn test_analyze(expr: Expr, expected: Vec) { println!("Begin analyze of {expr}"); let schema = schema(); - let physical_expr = logical2physical(&expr, &schema); + let physical_expr = logical2physical(&expr, &schema.as_ref().clone().into()); let actual = LiteralGuarantee::analyze(&physical_expr); assert_eq!( diff --git a/datafusion/proto-common/src/from_proto/mod.rs b/datafusion/proto-common/src/from_proto/mod.rs index df673de4e119..9006f4bd1357 100644 --- a/datafusion/proto-common/src/from_proto/mod.rs +++ b/datafusion/proto-common/src/from_proto/mod.rs @@ -30,6 +30,8 @@ use arrow::datatypes::{ }; use arrow::ipc::{reader::read_record_batch, root_as_message}; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{ arrow_datafusion_err, config::{ @@ -158,19 +160,21 @@ impl TryFrom<&protobuf::DfSchema> for DFSchema { df_schema: &protobuf::DfSchema, ) -> datafusion_common::Result { let df_fields = 
df_schema.columns.clone(); - let qualifiers_and_fields: Vec<(Option, Arc)> = df_fields - .iter() - .map(|df_field| { - let field: Field = df_field.field.as_ref().required("field")?; - Ok(( - df_field - .qualifier - .as_ref() - .map(|q| q.relation.clone().into()), - Arc::new(field), - )) - }) - .collect::, Error>>()?; + let qualifiers_and_fields: Vec<(Option, Arc)> = + df_fields + .iter() + .map(|df_field| { + let field: LogicalPhysicalField = + df_field.field.as_ref().required("field")?; + Ok(( + df_field + .qualifier + .as_ref() + .map(|q| q.relation.clone().into()), + Arc::new(field), + )) + }) + .collect::, Error>>()?; Ok(DFSchema::new_with_metadata( qualifiers_and_fields, @@ -190,6 +194,16 @@ impl TryFrom for DFSchemaRef { } } +impl TryFrom<&protobuf::ArrowType> for LogicalPhysicalType { + type Error = Error; + + fn try_from( + arrow_type: &protobuf::ArrowType, + ) -> datafusion_common::Result { + DataType::try_from(arrow_type).map(|t| t.into()) + } +} + impl TryFrom<&protobuf::ArrowType> for DataType { type Error = Error; @@ -332,6 +346,13 @@ impl TryFrom<&protobuf::Field> for Field { } } +impl TryFrom<&protobuf::Field> for LogicalPhysicalField { + type Error = Error; + fn try_from(field: &protobuf::Field) -> Result { + Field::try_from(field).map(|t| t.into()) + } +} + impl TryFrom<&protobuf::Schema> for Schema { type Error = Error; diff --git a/datafusion/proto-common/src/to_proto/mod.rs b/datafusion/proto-common/src/to_proto/mod.rs index 705a479e0178..eb0ad074bbfe 100644 --- a/datafusion/proto-common/src/to_proto/mod.rs +++ b/datafusion/proto-common/src/to_proto/mod.rs @@ -28,6 +28,7 @@ use arrow::datatypes::{ SchemaRef, TimeUnit, UnionMode, }; use arrow::ipc::writer::{DictionaryTracker, IpcDataGenerator}; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::{ config::{ ColumnOptions, CsvOptions, JsonOptions, ParquetOptions, TableParquetOptions, @@ -112,6 +113,17 @@ impl TryFrom<&DataType> for protobuf::ArrowType { } } +impl TryFrom<&LogicalPhysicalType> for protobuf::ArrowType { + type Error = Error; + + fn try_from(val: &LogicalPhysicalType) -> Result { + let arrow_type_enum: ArrowTypeEnum = val.physical().try_into()?; + Ok(Self { + arrow_type_enum: Some(arrow_type_enum), + }) + } +} + impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { type Error = Error; @@ -262,8 +274,9 @@ impl TryFrom<&DFSchema> for protobuf::DfSchema { let columns = s .iter() .map(|(qualifier, field)| { + let field: Field = field.as_ref().clone().into(); Ok(protobuf::DfField { - field: Some(field.as_ref().try_into()?), + field: Some((&field).try_into()?), qualifier: qualifier.map(|r| protobuf::ColumnRelation { relation: r.to_string(), }), diff --git a/datafusion/proto/src/logical_plan/from_proto.rs b/datafusion/proto/src/logical_plan/from_proto.rs index 095c6a50973a..4fdd9e6c4424 100644 --- a/datafusion/proto/src/logical_plan/from_proto.rs +++ b/datafusion/proto/src/logical_plan/from_proto.rs @@ -18,6 +18,7 @@ use std::sync::Arc; use datafusion::execution::registry::FunctionRegistry; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{ internal_err, plan_datafusion_err, DataFusionError, Result, ScalarValue, TableReference, UnnestOptions, @@ -551,7 +552,8 @@ pub fn parse_expr( "expr", codec, )?); - let data_type = cast.arrow_type.as_ref().required("arrow_type")?; + let data_type: LogicalPhysicalType = + cast.arrow_type.as_ref().required("arrow_type")?; Ok(Expr::Cast(Cast::new(expr, data_type))) } ExprType::TryCast(cast) 
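The proto codec changes above delegate to the existing DataType path: encoding serializes `val.physical()`, and decoding rebuilds a DataType and lifts it with `.into()`, so only the physical type crosses the wire. A round-trip sketch, assuming `ArrowType` is the crate-root re-export of `protobuf::ArrowType` used elsewhere in this diff:

use arrow::datatypes::DataType;
use datafusion_common::logical_type::LogicalPhysicalType;
use datafusion_proto_common::ArrowType;

fn main() {
    let original: LogicalPhysicalType = DataType::Utf8.into();
    // Encode: serializes original.physical() into the protobuf enum.
    let proto: ArrowType = (&original).try_into().expect("encodable type");
    // Decode: DataType::try_from(&proto), then lifted back via .into().
    let _back: LogicalPhysicalType = (&proto).try_into().expect("decodable type");
}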
=> { @@ -561,7 +563,8 @@ pub fn parse_expr( "expr", codec, )?); - let data_type = cast.arrow_type.as_ref().required("arrow_type")?; + let data_type: LogicalPhysicalType = + cast.arrow_type.as_ref().required("arrow_type")?; Ok(Expr::TryCast(TryCast::new(expr, data_type))) } ExprType::Sort(sort) => Ok(Expr::Sort(Sort::new( diff --git a/datafusion/proto/src/logical_plan/mod.rs b/datafusion/proto/src/logical_plan/mod.rs index 664cd7e11555..3fe2baca2ea9 100644 --- a/datafusion/proto/src/logical_plan/mod.rs +++ b/datafusion/proto/src/logical_plan/mod.rs @@ -63,11 +63,12 @@ use datafusion_expr::{ DistinctOn, DropView, Expr, LogicalPlan, LogicalPlanBuilder, ScalarUDF, }; +use self::to_proto::serialize_expr; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; +use datafusion_proto_common::ArrowType; use prost::bytes::BufMut; use prost::Message; -use self::to_proto::serialize_expr; - pub mod file_formats; pub mod from_proto; pub mod to_proto; @@ -836,10 +837,10 @@ impl AsLogicalPlan for LogicalPlanNode { LogicalPlanType::Prepare(prepare) => { let input: LogicalPlan = into_logical_plan!(prepare.input, ctx, extension_codec)?; - let data_types: Vec = prepare + let data_types: Vec = prepare .data_types .iter() - .map(DataType::try_from) + .map(|t| DataType::try_from(t).map(|t| t.into())) .collect::>()?; LogicalPlanBuilder::from(input) .prepare(prepare.name.clone(), data_types)? @@ -934,7 +935,7 @@ impl AsLogicalPlan for LogicalPlanNode { .. }) => { let provider = source_as_provider(source)?; - let schema = provider.schema(); + let schema = SchemaRef::new(provider.schema().as_ref().clone().into()); let source = provider.as_any(); let projection = match projection { @@ -1560,8 +1561,8 @@ impl AsLogicalPlan for LogicalPlanNode { name: name.clone(), data_types: data_types .iter() - .map(|t| t.try_into()) - .collect::, _>>()?, + .map(|t| t.physical().try_into()) + .collect::, _>>()?, input: Some(Box::new(input)), }, ))), diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index f764a050a6cd..ec9e014029a5 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -32,6 +32,7 @@ use datafusion::datasource::file_format::arrow::ArrowFormatFactory; use datafusion::datasource::file_format::csv::CsvFormatFactory; use datafusion::datasource::file_format::format_as_file_type; use datafusion::datasource::file_format::parquet::ParquetFormatFactory; +use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_proto::logical_plan::file_formats::{ ArrowLogicalExtensionCodec, CsvLogicalExtensionCodec, ParquetLogicalExtensionCodec, }; @@ -592,7 +593,10 @@ async fn roundtrip_expr_api() -> Result<()> { // list of expressions to round trip let expr_list = vec![ - encode(col("a").cast_to(&DataType::Utf8, &schema)?, lit("hex")), + encode( + col("a").cast_to(&DataType::Utf8.into(), &schema)?, + lit("hex"), + ), decode(lit("1234"), lit("hex")), array_to_string(make_array(vec![lit(1), lit(2), lit(3)]), lit(",")), array_dims(make_array(vec![lit(1), lit(2), lit(3)])), @@ -701,7 +705,7 @@ async fn roundtrip_expr_api() -> Result<()> { bit_and(lit(2)), bit_or(lit(2)), bit_xor(lit(2)), - string_agg(col("a").cast_to(&DataType::Utf8, &schema)?, lit("|")), + string_agg(col("a").cast_to(&DataType::Utf8.into(), &schema)?, lit("|")), bool_and(lit(true)), bool_or(lit(true)), ]; @@ -1606,13 +1610,18 @@ fn roundtrip_schema() { fn roundtrip_dfschema() { let 
dfschema = DFSchema::new_with_metadata( vec![ - (None, Arc::new(Field::new("a", DataType::Int64, false))), + ( + None, + Arc::new(LogicalPhysicalField::new("a", DataType::Int64, false)), + ), ( Some("t".into()), Arc::new( - Field::new("b", DataType::Decimal128(15, 2), true).with_metadata( - HashMap::from([(String::from("k1"), String::from("v1"))]), - ), + LogicalPhysicalField::new("b", DataType::Decimal128(15, 2), true) + .with_metadata(HashMap::from([( + String::from("k1"), + String::from("v1"), + )])), ), ), ], diff --git a/datafusion/sql/examples/sql.rs b/datafusion/sql/examples/sql.rs index aee4cf5a38ed..c5888682d596 100644 --- a/datafusion/sql/examples/sql.rs +++ b/datafusion/sql/examples/sql.rs @@ -17,6 +17,7 @@ use arrow_schema::{DataType, Field, Schema}; use datafusion_common::config::ConfigOptions; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{plan_err, Result}; use datafusion_expr::WindowUDF; use datafusion_expr::{ @@ -132,7 +133,7 @@ impl ContextProvider for MyContextProvider { self.udafs.get(name).cloned() } - fn get_variable_type(&self, _variable_names: &[String]) -> Option { + fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } diff --git a/datafusion/sql/src/expr/function.rs b/datafusion/sql/src/expr/function.rs index d9ddf57eb192..034dee9c5747 100644 --- a/datafusion/sql/src/expr/function.rs +++ b/datafusion/sql/src/expr/function.rs @@ -16,7 +16,8 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use arrow_schema::DataType; +use datafusion_common::logical_type::signature::LogicalType; +use datafusion_common::logical_type::TypeRelation; use datafusion_common::{ internal_datafusion_err, not_impl_err, plan_datafusion_err, plan_err, DFSchema, Dependency, Result, @@ -475,12 +476,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { pub(crate) fn check_unnest_arg(arg: &Expr, schema: &DFSchema) -> Result<()> { // Check argument type, array types are supported - match arg.get_type(schema)? { - DataType::List(_) - | DataType::LargeList(_) - | DataType::FixedSizeList(_, _) - | DataType::Struct(_) => Ok(()), - DataType::Null => { + match arg.get_type(schema)?.logical() { + LogicalType::List(_) | LogicalType::Struct(_) => Ok(()), + LogicalType::Null => { not_impl_err!("unnest() does not support null yet") } _ => { diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index d297b2e4df5b..f391b9ec02ea 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -16,7 +16,7 @@ // under the License. use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; -use arrow_schema::Field; +use datafusion_common::logical_type::field::LogicalPhysicalField; use datafusion_common::{ internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, ScalarValue, TableReference, @@ -280,7 +280,7 @@ fn search_dfschema<'ids, 'schema>( ids: &'ids [String], schema: &'schema DFSchema, ) -> Option<( - &'schema Field, + &'schema LogicalPhysicalField, Option<&'schema TableReference>, &'ids [String], )> { diff --git a/datafusion/sql/src/expr/mod.rs b/datafusion/sql/src/expr/mod.rs index 0546a101fcb2..d56414db36cf 100644 --- a/datafusion/sql/src/expr/mod.rs +++ b/datafusion/sql/src/expr/mod.rs @@ -15,8 +15,8 @@ // specific language governing permissions and limitations // under the License. 
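The SQL planner checks in this area, the unnest validation above and the LIKE / SIMILAR TO pattern checks below, converge on one shape: fetch the expression's type and match on its logical signature. A compact sketch of that shape, assuming the accessors these hunks import:

use datafusion_common::logical_type::signature::LogicalType;
use datafusion_common::logical_type::TypeRelation;
use datafusion_common::{plan_err, DFSchema, Result};
use datafusion_expr::{Expr, ExprSchemable};

// Same shape as the LIKE check below: a pattern must be a (possibly null) string.
fn check_pattern(pattern: &Expr, schema: &DFSchema) -> Result<()> {
    let pattern_type = pattern.get_type(schema)?;
    if !matches!(pattern_type.logical(), LogicalType::Utf8 | LogicalType::Null) {
        return plan_err!("Invalid pattern in LIKE expression");
    }
    Ok(())
}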
-use arrow_schema::DataType; -use arrow_schema::TimeUnit; +use arrow_schema::{DataType, TimeUnit}; +use datafusion_common::logical_type::signature::LogicalType; use datafusion_expr::planner::PlannerResult; use datafusion_expr::planner::RawDictionaryExpr; use datafusion_expr::planner::RawFieldAccessExpr; @@ -25,6 +25,7 @@ use sqlparser::ast::{ Value, }; +use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType}; use datafusion_common::{ internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema, Result, ScalarValue, @@ -329,13 +330,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // numeric constants are treated as seconds (rather as nanoseconds) // to align with postgres / duckdb semantics - let expr = match &dt { - DataType::Timestamp(TimeUnit::Nanosecond, tz) - if expr.get_type(schema)? == DataType::Int64 => + let expr = match dt.logical() { + LogicalType::Timestamp(TimeUnit::Nanosecond, tz) + if expr.get_type(schema)?.logical() == &LogicalType::Int64 => { Expr::Cast(Cast::new( Box::new(expr), - DataType::Timestamp(TimeUnit::Second, tz.clone()), + LogicalPhysicalType::from(DataType::Timestamp( + TimeUnit::Second, + tz.clone(), + )), )) } _ => expr, @@ -613,9 +617,9 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { planner_context, )?), match *time_zone { - SQLExpr::Value(Value::SingleQuotedString(s)) => { - DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())) - } + SQLExpr::Value(Value::SingleQuotedString(s)) => LogicalPhysicalType::from( + DataType::Timestamp(TimeUnit::Nanosecond, Some(s.into())), + ), _ => { return not_impl_err!( "Unsupported ast node in sqltorel: {time_zone:?}" @@ -804,7 +808,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) -> Result { let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?; let pattern_type = pattern.get_type(schema)?; - if pattern_type != DataType::Utf8 && pattern_type != DataType::Null { + if !matches!( + pattern_type.logical(), + LogicalType::Utf8 | LogicalType::Null + ) { return plan_err!("Invalid pattern in LIKE expression"); } let escape_char = if let Some(char) = escape_char { @@ -835,7 +842,10 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { ) -> Result { let pattern = self.sql_expr_to_logical_expr(pattern, schema, planner_context)?; let pattern_type = pattern.get_type(schema)?; - if pattern_type != DataType::Utf8 && pattern_type != DataType::Null { + if !matches!( + pattern_type.logical(), + LogicalType::Utf8 | LogicalType::Null + ) { return plan_err!("Invalid pattern in SIMILAR TO expression"); } let escape_char = if let Some(char) = escape_char { @@ -943,6 +953,7 @@ mod tests { use std::sync::Arc; use arrow::datatypes::{Field, Schema}; + use arrow_schema::DataType; use sqlparser::dialect::GenericDialect; use sqlparser::parser::Parser; @@ -994,7 +1005,7 @@ mod tests { None } - fn get_variable_type(&self, _variable_names: &[String]) -> Option { + fn get_variable_type(&self, _variable_names: &[String]) -> Option { None } diff --git a/datafusion/sql/src/expr/value.rs b/datafusion/sql/src/expr/value.rs index 5cd6ffc68788..4c3db1a15d06 100644 --- a/datafusion/sql/src/expr/value.rs +++ b/datafusion/sql/src/expr/value.rs @@ -18,7 +18,7 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow::compute::kernels::cast_utils::parse_interval_month_day_nano; use arrow::datatypes::DECIMAL128_MAX_PRECISION; -use arrow_schema::DataType; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::{ internal_err, not_impl_err, plan_err, DFSchema, 
DataFusionError, Result, ScalarValue, }; @@ -34,7 +34,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { pub(crate) fn parse_value( &self, value: Value, - param_data_types: &[DataType], + param_data_types: &[LogicalPhysicalType], ) -> Result { match value { Value::Number(n, _) => self.parse_sql_number(&n, false), @@ -97,7 +97,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { /// number 1, 2, ... etc. For example, `$1` is the first placeholder; $2 is the second one and so on. fn create_placeholder_expr( param: String, - param_data_types: &[DataType], + param_data_types: &[LogicalPhysicalType], ) -> Result { // Parse the placeholder as a number because it is the only support from sqlparser and postgres let index = param[1..].parse::(); diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index a77f0003f738..b903be8aee68 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -30,6 +30,10 @@ use sqlparser::ast::{ArrayElemTypeDef, ExactNumberInfo}; use sqlparser::ast::{ColumnDef as SQLColumnDef, ColumnOption}; use sqlparser::ast::{DataType as SQLDataType, Ident, ObjectName, TableAlias}; +use datafusion_common::logical_type::field::LogicalPhysicalField; +use datafusion_common::logical_type::fields::LogicalPhysicalFields; +use datafusion_common::logical_type::schema::LogicalPhysicalSchema; +use datafusion_common::logical_type::LogicalPhysicalType; use datafusion_common::TableReference; use datafusion_common::{ not_impl_err, plan_err, unqualified_field_not_found, DFSchema, DataFusionError, @@ -103,7 +107,7 @@ impl IdentNormalizer { pub struct PlannerContext { /// Data types for numbered parameters ($1, $2, etc), if supplied /// in `PREPARE` statement - prepare_param_data_types: Arc>, + prepare_param_data_types: Arc>, /// Map of CTE name to logical plan of the WITH clause. 
/// Use `Arc` to allow cheap cloning ctes: HashMap>, @@ -130,7 +134,7 @@ impl PlannerContext { /// Update the PlannerContext with provided prepare_param_data_types pub fn with_prepare_param_data_types( mut self, - prepare_param_data_types: Vec, + prepare_param_data_types: Vec, ) -> Self { self.prepare_param_data_types = prepare_param_data_types.into(); self @@ -152,7 +156,7 @@ impl PlannerContext { } /// Return the types of parameters (`$1`, `$2`, etc) if known - pub fn prepare_param_data_types(&self) -> &[DataType] { + pub fn prepare_param_data_types(&self) -> &[LogicalPhysicalType] { &self.prepare_param_data_types } @@ -214,7 +218,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - pub fn build_schema(&self, columns: Vec) -> Result { + pub fn build_schema(&self, columns: Vec) -> Result { let mut fields = Vec::with_capacity(columns.len()); for column in columns { @@ -223,14 +227,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .options .iter() .any(|x| x.option == ColumnOption::NotNull); - fields.push(Field::new( + fields.push(LogicalPhysicalField::new( self.normalizer.normalize(column.name), data_type, !not_nullable, )); } - Ok(Schema::new(fields)) + Ok(LogicalPhysicalSchema::new(fields)) } /// Returns a vector of (column_name, default_expr) pairs @@ -334,13 +338,16 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }) } - pub(crate) fn convert_data_type(&self, sql_type: &SQLDataType) -> Result { + pub(crate) fn convert_data_type( + &self, + sql_type: &SQLDataType, + ) -> Result { match sql_type { SQLDataType::Array(ArrayElemTypeDef::AngleBracket(inner_sql_type)) | SQLDataType::Array(ArrayElemTypeDef::SquareBracket(inner_sql_type, _)) => { // Arrays may be multi-dimensional. let inner_data_type = self.convert_data_type(inner_sql_type)?; - Ok(DataType::new_list(inner_data_type, true)) + Ok(LogicalPhysicalType::new_list(inner_data_type, true).into()) } SQLDataType::Array(ArrayElemTypeDef::None) => { not_impl_err!("Arrays with unspecified type is not supported") @@ -349,31 +356,31 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { } } - fn convert_simple_data_type(&self, sql_type: &SQLDataType) -> Result { + fn convert_simple_data_type(&self, sql_type: &SQLDataType) -> Result { match sql_type { - SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean), - SQLDataType::TinyInt(_) => Ok(DataType::Int8), - SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16), - SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32), - SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64), - SQLDataType::UnsignedTinyInt(_) => Ok(DataType::UInt8), - SQLDataType::UnsignedSmallInt(_) | SQLDataType::UnsignedInt2(_) => Ok(DataType::UInt16), + SQLDataType::Boolean | SQLDataType::Bool => Ok(DataType::Boolean.into()), + SQLDataType::TinyInt(_) => Ok(DataType::Int8.into()), + SQLDataType::SmallInt(_) | SQLDataType::Int2(_) => Ok(DataType::Int16.into()), + SQLDataType::Int(_) | SQLDataType::Integer(_) | SQLDataType::Int4(_) => Ok(DataType::Int32.into()), + SQLDataType::BigInt(_) | SQLDataType::Int8(_) => Ok(DataType::Int64.into()), + SQLDataType::UnsignedTinyInt(_) => Ok(DataType::UInt8.into()), + SQLDataType::UnsignedSmallInt(_) | SQLDataType::UnsignedInt2(_) => Ok(DataType::UInt16.into()), SQLDataType::UnsignedInt(_) | SQLDataType::UnsignedInteger(_) | SQLDataType::UnsignedInt4(_) => { - Ok(DataType::UInt32) + Ok(DataType::UInt32.into()) } SQLDataType::Varchar(length) => { match (length, self.options.support_varchar_with_length) { 
(Some(_), false) => plan_err!("does not support Varchar with length, please set `support_varchar_with_length` to be true"), - _ => Ok(DataType::Utf8), + _ => Ok(DataType::Utf8.into()), } } - SQLDataType::UnsignedBigInt(_) | SQLDataType::UnsignedInt8(_) => Ok(DataType::UInt64), - SQLDataType::Float(_) => Ok(DataType::Float32), - SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32), - SQLDataType::Double | SQLDataType::DoublePrecision | SQLDataType::Float8 => Ok(DataType::Float64), + SQLDataType::UnsignedBigInt(_) | SQLDataType::UnsignedInt8(_) => Ok(DataType::UInt64.into()), + SQLDataType::Float(_) => Ok(DataType::Float32.into()), + SQLDataType::Real | SQLDataType::Float4 => Ok(DataType::Float32.into()), + SQLDataType::Double | SQLDataType::DoublePrecision | SQLDataType::Float8 => Ok(DataType::Float64.into()), SQLDataType::Char(_) | SQLDataType::Text - | SQLDataType::String(_) => Ok(DataType::Utf8), + | SQLDataType::String(_) => Ok(DataType::Utf8.into()), SQLDataType::Timestamp(None, tz_info) => { let tz = if matches!(tz_info, TimezoneInfo::Tz) || matches!(tz_info, TimezoneInfo::WithTimeZone) @@ -386,14 +393,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Timestamp Without Time zone None }; - Ok(DataType::Timestamp(TimeUnit::Nanosecond, tz.map(Into::into))) + Ok(DataType::Timestamp(TimeUnit::Nanosecond, tz.map(Into::into)).into()) } - SQLDataType::Date => Ok(DataType::Date32), + SQLDataType::Date => Ok(DataType::Date32.into()), SQLDataType::Time(None, tz_info) => { if matches!(tz_info, TimezoneInfo::None) || matches!(tz_info, TimezoneInfo::WithoutTimeZone) { - Ok(DataType::Time64(TimeUnit::Nanosecond)) + Ok(DataType::Time64(TimeUnit::Nanosecond).into()) } else { // We dont support TIMETZ and TIME WITH TIME ZONE for now not_impl_err!( @@ -412,8 +419,8 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { }; make_decimal_type(precision, scale) } - SQLDataType::Bytea => Ok(DataType::Binary), - SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano)), + SQLDataType::Bytea => Ok(DataType::Binary.into()), + SQLDataType::Interval => Ok(DataType::Interval(IntervalUnit::MonthDayNano).into()), SQLDataType::Struct(fields) => { let fields = fields .iter() @@ -424,14 +431,14 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Some(ident) => ident.clone(), None => Ident::new(format!("c{idx}")) }; - Ok(Arc::new(Field::new( + Ok(Arc::new(LogicalPhysicalField::new( self.normalizer.normalize(field_name), data_type, true, ))) }) .collect::>>()?; - Ok(DataType::Struct(Fields::from(fields))) + Ok(LogicalPhysicalType::new_struct(LogicalPhysicalFields::from(fields))) } // Explicitly list all other types so that if sqlparser // adds/changes the `SQLDataType` the compiler will tell us on upgrade diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs index ee2e35b550f6..8978cb10b768 100644 --- a/datafusion/sql/src/relation/join.rs +++ b/datafusion/sql/src/relation/join.rs @@ -122,7 +122,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { .build() } JoinConstraint::Natural => { - let left_cols: HashSet<&String> = + let left_cols: HashSet<&str> = left.schema().fields().iter().map(|f| f.name()).collect(); let keys: Vec = right .schema() diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs index 6cdb2f959cd8..6c63a3be4830 100644 --- a/datafusion/sql/src/statement.rs +++ b/datafusion/sql/src/statement.rs @@ -15,11 +15,6 @@ // specific language governing permissions and limitations // under the License. 
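statement.rs below converts PREPARE parameter types through the same `convert_data_type` path, storing the results in the PlannerContext updated above. A usage sketch of that API (plain Arrow types lift via `.into()`; `PlannerContext::new` is assumed unchanged by this patch):

use arrow::datatypes::DataType;
use datafusion_sql::planner::PlannerContext;

fn main() {
    // Parameter types for `PREPARE my_plan(INT, TEXT) AS ...`.
    let ctx = PlannerContext::new().with_prepare_param_data_types(vec![
        DataType::Int32.into(),
        DataType::Utf8.into(),
    ]);
    assert_eq!(ctx.prepare_param_data_types().len(), 2);
}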
diff --git a/datafusion/sql/src/relation/join.rs b/datafusion/sql/src/relation/join.rs
index ee2e35b550f6..8978cb10b768 100644
--- a/datafusion/sql/src/relation/join.rs
+++ b/datafusion/sql/src/relation/join.rs
@@ -122,7 +122,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                     .build()
             }
             JoinConstraint::Natural => {
-                let left_cols: HashSet<&String> =
+                let left_cols: HashSet<&str> =
                     left.schema().fields().iter().map(|f| f.name()).collect();
                 let keys: Vec<Column> = right
                     .schema()
diff --git a/datafusion/sql/src/statement.rs b/datafusion/sql/src/statement.rs
index 6cdb2f959cd8..6c63a3be4830 100644
--- a/datafusion/sql/src/statement.rs
+++ b/datafusion/sql/src/statement.rs
@@ -15,11 +15,6 @@
 // specific language governing permissions and limitations
 // under the License.

-use std::collections::{BTreeMap, HashMap, HashSet};
-use std::path::Path;
-use std::str::FromStr;
-use std::sync::Arc;
-
 use crate::parser::{
     CopyToSource, CopyToStatement, CreateExternalTable, DFParser, ExplainStatement,
     LexOrdering, Statement as DFStatement,
@@ -28,8 +23,15 @@ use crate::planner::{
     object_name_to_qualifier, ContextProvider, PlannerContext, SqlToRel,
 };
 use crate::utils::normalize_ident;
+use arrow_schema::SchemaRef;
+use std::collections::{BTreeMap, HashMap, HashSet};
+use std::path::Path;
+use std::str::FromStr;
+use std::sync::Arc;

-use arrow_schema::{DataType, Fields};
+use datafusion_common::logical_type::fields::LogicalPhysicalFields;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchema;
+use datafusion_common::logical_type::LogicalPhysicalType;
 use datafusion_common::parsers::CompressionTypeVariant;
 use datafusion_common::{
     exec_err, not_impl_err, plan_datafusion_err, plan_err, schema_err,
@@ -453,7 +455,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                 statement,
             } => {
                 // Convert parser data types to DataFusion data types
-                let data_types: Vec<DataType> = data_types
+                let data_types: Vec<LogicalPhysicalType> = data_types
                     .into_iter()
                     .map(|t| self.convert_data_type(&t))
                     .collect::<Result<_>>()?;
@@ -854,7 +856,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         let output_schema = DFSchema::try_from(LogicalPlan::describe_schema()).unwrap();

         Ok(LogicalPlan::DescribeTable(DescribeTable {
-            schema,
+            schema: SchemaRef::new(schema.as_ref().clone().into()),
             output_schema: Arc::new(output_schema),
         }))
     }
@@ -1223,7 +1225,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         // Do a table lookup to verify the table exists
         let table_ref = self.object_name_to_table_reference(table_name.clone())?;
         let table_source = self.context_provider.get_table_source(table_ref.clone())?;
-        let schema = (*table_source.schema()).clone();
+        let schema: LogicalPhysicalSchema = (*table_source.schema()).clone().into();
         let schema = DFSchema::try_from(schema)?;
         let scan = LogicalPlanBuilder::scan(
             object_name_to_string(&table_name),
@@ -1276,7 +1278,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
         let table_source = self.context_provider.get_table_source(table_name.clone())?;
         let table_schema = Arc::new(DFSchema::try_from_qualified_schema(
             table_name.clone(),
-            &table_source.schema(),
+            &table_source.schema().as_ref().clone().into(),
         )?);

         // Overwrite with assignment expressions
@@ -1381,7 +1383,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
        // Do a table lookup to verify the table exists
        let table_name = self.object_name_to_table_reference(table_name)?;
        let table_source = self.context_provider.get_table_source(table_name.clone())?;
-        let arrow_schema = (*table_source.schema()).clone();
+        let arrow_schema: LogicalPhysicalSchema = (*table_source.schema()).clone().into();
        let table_schema = DFSchema::try_from(arrow_schema)?;

        // Get insert fields and target table's value indices
@@ -1419,7 +1421,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> {
                    Ok(table_schema.field(column_index).clone())
                })
                .collect::<Result<Vec<_>>>()?;
-            (Fields::from(fields), value_indices)
+            (LogicalPhysicalFields::from(fields), value_indices)
        };

        // infer types for Values clause... other types should be resolvable the regular way
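A second idiom repeats through these statement.rs hunks: the Arrow schema handed out by a `TableSource` is cloned and lifted into the logical schema types before a `DFSchema` is built from it. A condensed sketch of that step (the helper name `to_logical` is illustrative, and the `From<Schema>` conversion is assumed from the call sites above, not confirmed API):

    use std::sync::Arc;
    use arrow_schema::{DataType, Field, Schema, SchemaRef};
    use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;

    // Wrap an Arrow schema into the logical handle that `TableSource::schema`
    // returns elsewhere in this diff.
    fn to_logical(table_schema: SchemaRef) -> LogicalPhysicalSchemaRef {
        LogicalPhysicalSchemaRef::new(table_schema.as_ref().clone().into())
    }

    fn main() {
        let arrow = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
        let _logical = to_logical(arrow);
    }
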
diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs
index 198186934c84..09a80df44712 100644
--- a/datafusion/sql/src/unparser/expr.rs
+++ b/datafusion/sql/src/unparser/expr.rs
@@ -27,13 +27,14 @@ use arrow_array::types::{
     TimestampNanosecondType, TimestampSecondType,
 };
 use arrow_array::{Date32Array, Date64Array, PrimitiveArray};
-use arrow_schema::DataType;
+use datafusion_common::logical_type::signature::LogicalType;
 use sqlparser::ast::Value::SingleQuotedString;
 use sqlparser::ast::{
     self, Expr as AstExpr, Function, FunctionArg, Ident, Interval, TimezoneInfo,
     UnaryOperator,
 };
+use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType};
 use datafusion_common::{
     internal_datafusion_err, internal_err, not_impl_err, plan_err, Column, Result,
     ScalarValue,
@@ -958,26 +959,29 @@ impl Unparser<'_> {
         }
     }

-    fn arrow_dtype_to_ast_dtype(&self, data_type: &DataType) -> Result<ast::DataType> {
-        match data_type {
-            DataType::Null => {
+    fn arrow_dtype_to_ast_dtype(
+        &self,
+        data_type: &LogicalPhysicalType,
+    ) -> Result<ast::DataType> {
+        match data_type.logical() {
+            LogicalType::Null => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::Boolean => Ok(ast::DataType::Bool),
-            DataType::Int8 => Ok(ast::DataType::TinyInt(None)),
-            DataType::Int16 => Ok(ast::DataType::SmallInt(None)),
-            DataType::Int32 => Ok(ast::DataType::Integer(None)),
-            DataType::Int64 => Ok(ast::DataType::BigInt(None)),
-            DataType::UInt8 => Ok(ast::DataType::UnsignedTinyInt(None)),
-            DataType::UInt16 => Ok(ast::DataType::UnsignedSmallInt(None)),
-            DataType::UInt32 => Ok(ast::DataType::UnsignedInteger(None)),
-            DataType::UInt64 => Ok(ast::DataType::UnsignedBigInt(None)),
-            DataType::Float16 => {
+            LogicalType::Boolean => Ok(ast::DataType::Bool),
+            LogicalType::Int8 => Ok(ast::DataType::TinyInt(None)),
+            LogicalType::Int16 => Ok(ast::DataType::SmallInt(None)),
+            LogicalType::Int32 => Ok(ast::DataType::Integer(None)),
+            LogicalType::Int64 => Ok(ast::DataType::BigInt(None)),
+            LogicalType::UInt8 => Ok(ast::DataType::UnsignedTinyInt(None)),
+            LogicalType::UInt16 => Ok(ast::DataType::UnsignedSmallInt(None)),
+            LogicalType::UInt32 => Ok(ast::DataType::UnsignedInteger(None)),
+            LogicalType::UInt64 => Ok(ast::DataType::UnsignedBigInt(None)),
+            LogicalType::Float16 => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::Float32 => Ok(ast::DataType::Float(None)),
-            DataType::Float64 => Ok(ast::DataType::Double),
-            DataType::Timestamp(_, tz) => {
+            LogicalType::Float32 => Ok(ast::DataType::Float(None)),
+            LogicalType::Float64 => Ok(ast::DataType::Double),
+            LogicalType::Timestamp(_, tz) => {
                 let tz_info = match tz {
                     Some(_) => TimezoneInfo::WithTimeZone,
                     None => TimezoneInfo::None,
@@ -985,63 +989,34 @@ impl Unparser<'_> {

                 Ok(ast::DataType::Timestamp(None, tz_info))
             }
-            DataType::Date32 => Ok(ast::DataType::Date),
-            DataType::Date64 => Ok(ast::DataType::Datetime(None)),
-            DataType::Time32(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Time64(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Duration(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Interval(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Binary => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::FixedSizeBinary(_) => {
+            LogicalType::Date => Ok(ast::DataType::Date),
+            LogicalType::Time32(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::LargeBinary => {
+            LogicalType::Time64(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::BinaryView => {
+            LogicalType::Duration(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::Utf8 => Ok(ast::DataType::Varchar(None)),
-            DataType::LargeUtf8 => Ok(ast::DataType::Text),
-            DataType::Utf8View => {
+            LogicalType::Interval(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::List(_) => {
+            LogicalType::Binary => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::FixedSizeList(_, _) => {
+            LogicalType::Utf8 => Ok(ast::DataType::Varchar(None)),
+            LogicalType::List(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::LargeList(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::ListView(_) => {
+            LogicalType::Struct(_) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::LargeListView(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Struct(_) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Union(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::Dictionary(_, _) => {
+            LogicalType::Union(_, _) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
-            DataType::Decimal128(precision, scale)
-            | DataType::Decimal256(precision, scale) => {
+            LogicalType::Decimal128(precision, scale)
+            | LogicalType::Decimal256(precision, scale) => {
                 let mut new_precision = *precision as u64;
                 let mut new_scale = *scale as u64;
                 if *scale < 0 {
@@ -1053,10 +1028,7 @@ impl Unparser<'_> {
                     ast::ExactNumberInfo::PrecisionAndScale(new_precision, new_scale),
                 ))
             }
-            DataType::Map(_, _) => {
-                not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
-            }
-            DataType::RunEndEncoded(_, _) => {
+            LogicalType::Map(_, _) => {
                 not_impl_err!("Unsupported DataType: conversion: {data_type:?}")
             }
         }
@@ -1070,7 +1042,7 @@ mod tests {
     use arrow::datatypes::TimeUnit;
     use arrow::datatypes::{Field, Schema};
-    use arrow_schema::DataType::Int8;
+    use arrow_schema::DataType;
     use datafusion_common::TableReference;
     use datafusion_expr::{
         case, col, cube, exists, grouping_set, interval_datetime_lit,
@@ -1159,7 +1131,7 @@ mod tests {
             (
                 Expr::Cast(Cast {
                     expr: Box::new(col("a")),
-                    data_type: DataType::Date64,
+                    data_type: DataType::Date64.into(),
                 }),
                 r#"CAST(a AS DATETIME)"#,
             ),
@@ -1183,7 +1155,7 @@ mod tests {
             (
                 Expr::Cast(Cast {
                     expr: Box::new(col("a")),
-                    data_type: DataType::UInt32,
+                    data_type: DataType::UInt32.into(),
                 }),
                 r#"CAST(a AS INTEGER UNSIGNED)"#,
             ),
@@ -1417,12 +1389,12 @@ mod tests {
                 r#"TRY_CAST(a AS INTEGER UNSIGNED)"#,
             ),
             (
-                Expr::ScalarVariable(Int8, vec![String::from("@a")]),
+                Expr::ScalarVariable(DataType::Int8.into(), vec![String::from("@a")]),
                 r#"@a"#,
             ),
             (
                 Expr::ScalarVariable(
-                    Int8,
+                    DataType::Int8.into(),
                     vec![String::from("@root"), String::from("foo")],
                 ),
                 r#"@root.foo"#,
@@ -1504,7 +1476,7 @@ mod tests {
             (
                 Expr::Cast(Cast {
                     expr: Box::new(col("a")),
-                    data_type: DataType::Decimal128(10, -2),
+                    data_type: DataType::Decimal128(10, -2).into(),
                 }),
                 r#"CAST(a AS DECIMAL(12,0))"#,
             ),
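The decimal arm above is the one place the unparser has to do arithmetic: SQL's DECIMAL(p, s) cannot express a negative scale, so the precision is widened instead. The exact branch body is elided in this hunk, but the test expectation `DataType::Decimal128(10, -2)` to `CAST(a AS DECIMAL(12,0))` pins the behavior down. A hedged sketch of that widening (the helper name is illustrative):

    // Decimal128(10, -2) stores 10 significant digits shifted two places left,
    // so as SQL text it needs DECIMAL(12, 0).
    fn sql_decimal_params(precision: u8, scale: i8) -> (u64, u64) {
        let mut new_precision = precision as u64;
        let mut new_scale = scale as u64;
        if scale < 0 {
            new_precision += scale.unsigned_abs() as u64;
            new_scale = 0;
        }
        (new_precision, new_scale)
    }

    fn main() {
        assert_eq!(sql_decimal_params(10, -2), (12, 0));
    }
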
diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs
index 2eacbd174fc2..2d7cc56753ad 100644
--- a/datafusion/sql/src/utils.rs
+++ b/datafusion/sql/src/utils.rs
@@ -22,6 +22,8 @@ use std::collections::HashMap;
 use arrow_schema::{
     DataType, DECIMAL128_MAX_PRECISION, DECIMAL256_MAX_PRECISION, DECIMAL_DEFAULT_SCALE,
 };
+use datafusion_common::logical_type::signature::LogicalType;
+use datafusion_common::logical_type::{TypeRelation, LogicalPhysicalType};
 use datafusion_common::tree_node::{
     Transformed, TransformedResult, TreeNode, TreeNodeRecursion,
 };
@@ -228,7 +230,7 @@ pub fn window_expr_common_partition_keys(window_exprs: &[Expr]) -> Result<&[Expr]> {
 pub(crate) fn make_decimal_type(
     precision: Option<u64>,
     scale: Option<u64>,
-) -> Result<DataType> {
+) -> Result<LogicalPhysicalType> {
     // postgres like behavior
     let (precision, scale) = match (precision, scale) {
         (Some(p), Some(s)) => (p as u8, s as i8),
@@ -249,9 +251,9 @@ pub(crate) fn make_decimal_type(
     } else if precision > DECIMAL128_MAX_PRECISION
         && precision <= DECIMAL256_MAX_PRECISION
     {
-        Ok(DataType::Decimal256(precision, scale))
+        Ok(DataType::Decimal256(precision, scale).into())
     } else {
-        Ok(DataType::Decimal128(precision, scale))
+        Ok(DataType::Decimal128(precision, scale).into())
     }
 }

@@ -324,7 +326,7 @@ pub(crate) fn transform_bottom_unnest(
         if let Expr::Unnest(Unnest { expr: ref arg }) = expr {
             let (data_type, _) = arg.data_type_and_nullable(input.schema())?;

-            if let DataType::Struct(_) = data_type {
+            if let LogicalType::Struct(_) = data_type.logical() {
                 return internal_err!("unnest on struct can ony be applied at the root level of select expression");
             }
diff --git a/datafusion/sql/tests/cases/plan_to_sql.rs b/datafusion/sql/tests/cases/plan_to_sql.rs
index 374403d853f9..e672d228cf4c 100644
--- a/datafusion/sql/tests/cases/plan_to_sql.rs
+++ b/datafusion/sql/tests/cases/plan_to_sql.rs
@@ -28,11 +28,10 @@ use datafusion_sql::unparser::dialect::{
 };
 use datafusion_sql::unparser::{expr_to_sql, plan_to_sql, Unparser};

+use crate::common::MockContextProvider;
 use sqlparser::dialect::{Dialect, GenericDialect, MySqlDialect};
 use sqlparser::parser::Parser;

-use crate::common::MockContextProvider;
-
 #[test]
 fn roundtrip_expr() {
     let tests: Vec<(TableReference, &str, &str)> = vec![
diff --git a/datafusion/sql/tests/common/mod.rs b/datafusion/sql/tests/common/mod.rs
index f5caaefb3ea0..3ef172187718 100644
--- a/datafusion/sql/tests/common/mod.rs
+++ b/datafusion/sql/tests/common/mod.rs
@@ -24,6 +24,8 @@ use std::{sync::Arc, vec};
 use arrow_schema::*;
 use datafusion_common::config::ConfigOptions;
 use datafusion_common::file_options::file_type::FileType;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
+use datafusion_common::logical_type::LogicalPhysicalType;
 use datafusion_common::{plan_err, GetExt, Result, TableReference};
 use datafusion_expr::{AggregateUDF, ScalarUDF, TableSource, WindowUDF};
 use datafusion_sql::planner::ContextProvider;
@@ -202,7 +204,7 @@ impl ContextProvider for MockContextProvider {
         self.udafs.get(name).cloned()
     }

-    fn get_variable_type(&self, _: &[String]) -> Option<DataType> {
+    fn get_variable_type(&self, _: &[String]) -> Option<LogicalPhysicalType> {
         unimplemented!()
     }

@@ -257,7 +259,7 @@ impl TableSource for EmptyTable {
         self
     }

-    fn schema(&self) -> SchemaRef {
-        self.table_schema.clone()
+    fn schema(&self) -> LogicalPhysicalSchemaRef {
+        LogicalPhysicalSchemaRef::new(self.table_schema.clone().into())
     }
 }
diff --git a/datafusion/sql/tests/sql_integration.rs b/datafusion/sql/tests/sql_integration.rs
index aca0d040bb8d..2c1a10370cb7 100644
--- a/datafusion/sql/tests/sql_integration.rs
+++ b/datafusion/sql/tests/sql_integration.rs
@@ -37,6 +37,7 @@ use datafusion_sql::{
     planner::{ParserOptions, SqlToRel},
 };
+use datafusion_common::logical_type::LogicalPhysicalType;
 use datafusion_functions_aggregate::{
     approx_median::approx_median_udaf, count::count_udaf,
 };
@@ -3647,8 +3648,8 @@ fn test_prepare_statement_should_infer_types() {
     let plan = logical_plan(sql).unwrap();
     let actual_types = plan.get_parameter_types().unwrap();
     let expected_types = HashMap::from([
-        ("$1".to_string(), Some(DataType::Int32)),
-        ("$2".to_string(), Some(DataType::Int64)),
+        ("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int32))),
+        ("$2".to_string(), Some(LogicalPhysicalType::from(DataType::Int64))),
     ]);
     assert_eq!(actual_types, expected_types);
 }
@@ -3661,7 +3662,7 @@ fn test_non_prepare_statement_should_infer_types() {
     let actual_types = plan.get_parameter_types().unwrap();
     let expected_types = HashMap::from([
         // constant 1 is inferred to be int64
-        ("$1".to_string(), Some(DataType::Int64)),
+        ("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int64))),
     ]);
     assert_eq!(actual_types, expected_types);
 }
@@ -3836,7 +3837,8 @@ Projection: person.id, orders.order_id
     let plan = prepare_stmt_quick_test(sql, expected_plan, expected_dt);

     let actual_types = plan.get_parameter_types().unwrap();
-    let expected_types = HashMap::from([("$1".to_string(), Some(DataType::Int32))]);
+    let expected_types =
+        HashMap::from([("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int32)))]);
     assert_eq!(actual_types, expected_types);

     // replace params with values
@@ -3868,7 +3870,8 @@ Projection: person.id, person.age
     let plan = prepare_stmt_quick_test(sql, expected_plan, expected_dt);

     let actual_types = plan.get_parameter_types().unwrap();
-    let expected_types = HashMap::from([("$1".to_string(), Some(DataType::Int32))]);
+    let expected_types =
+        HashMap::from([("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int32)))]);
     assert_eq!(actual_types, expected_types);

     // replace params with values
@@ -3900,8 +3903,8 @@ Projection: person.id, person.age

     let actual_types = plan.get_parameter_types().unwrap();
     let expected_types = HashMap::from([
-        ("$1".to_string(), Some(DataType::Int32)),
-        ("$2".to_string(), Some(DataType::Int32)),
+        ("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int32))),
+        ("$2".to_string(), Some(LogicalPhysicalType::from(DataType::Int32))),
     ]);
     assert_eq!(actual_types, expected_types);

@@ -3939,7 +3942,8 @@ Projection: person.id, person.age
     let plan = prepare_stmt_quick_test(sql, expected_plan, expected_dt);

     let actual_types = plan.get_parameter_types().unwrap();
-    let expected_types = HashMap::from([("$1".to_string(), Some(DataType::UInt32))]);
+    let expected_types =
+        HashMap::from([("$1".to_string(), Some(LogicalPhysicalType::from(DataType::UInt32)))]);
     assert_eq!(actual_types, expected_types);

     // replace params with values
@@ -3977,8 +3981,8 @@ Dml: op=[Update] table=[person]

     let actual_types = plan.get_parameter_types().unwrap();
     let expected_types = HashMap::from([
-        ("$1".to_string(), Some(DataType::Int32)),
-        ("$2".to_string(), Some(DataType::UInt32)),
+        ("$1".to_string(), Some(LogicalPhysicalType::from(DataType::Int32))),
+        ("$2".to_string(), Some(LogicalPhysicalType::from(DataType::UInt32))),
     ]);
     assert_eq!(actual_types, expected_types);

@@ -4012,9 +4016,9 @@ fn test_prepare_statement_insert_infer() {

     let actual_types = plan.get_parameter_types().unwrap();
     let expected_types = HashMap::from([
-        ("$1".to_string(), Some(DataType::UInt32)),
-        ("$2".to_string(), Some(DataType::Utf8)),
-        ("$3".to_string(), Some(DataType::Utf8)),
+        ("$1".to_string(), Some(LogicalPhysicalType::from(DataType::UInt32))),
+        ("$2".to_string(), Some(LogicalPhysicalType::from(DataType::Utf8))),
+        ("$3".to_string(), Some(LogicalPhysicalType::from(DataType::Utf8))),
     ]);
     assert_eq!(actual_types, expected_types);
diff --git a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
index 520b6b53b32d..4791669b10b2 100644
--- a/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
+++ b/datafusion/sqllogictest/src/engines/datafusion_engine/normalize.rs
@@ -15,10 +15,12 @@
 // specific language governing permissions and limitations
 // under the License.

-use arrow::datatypes::Fields;
 use arrow::util::display::ArrayFormatter;
 use arrow::{array, array::ArrayRef, datatypes::DataType, record_batch::RecordBatch};
 use datafusion_common::format::DEFAULT_FORMAT_OPTIONS;
+use datafusion_common::logical_type::fields::LogicalPhysicalFields;
+use datafusion_common::logical_type::signature::LogicalType;
+use datafusion_common::logical_type::TypeRelation;
 use datafusion_common::DataFusionError;
 use std::path::PathBuf;
 use std::sync::OnceLock;
@@ -243,31 +245,30 @@ pub fn cell_to_string(col: &ArrayRef, row: usize) -> Result<String> {
 }

 /// Converts columns to a result as expected by sqllogicteset.
-pub(crate) fn convert_schema_to_types(columns: &Fields) -> Vec<DFColumnType> {
+pub(crate) fn convert_schema_to_types(columns: &LogicalPhysicalFields) -> Vec<DFColumnType> {
     columns
         .iter()
         .map(|f| f.data_type())
-        .map(|data_type| match data_type {
-            DataType::Boolean => DFColumnType::Boolean,
-            DataType::Int8
-            | DataType::Int16
-            | DataType::Int32
-            | DataType::Int64
-            | DataType::UInt8
-            | DataType::UInt16
-            | DataType::UInt32
-            | DataType::UInt64 => DFColumnType::Integer,
-            DataType::Float16
-            | DataType::Float32
-            | DataType::Float64
-            | DataType::Decimal128(_, _)
-            | DataType::Decimal256(_, _) => DFColumnType::Float,
-            DataType::Utf8 | DataType::LargeUtf8 => DFColumnType::Text,
-            DataType::Date32
-            | DataType::Date64
-            | DataType::Time32(_)
-            | DataType::Time64(_) => DFColumnType::DateTime,
-            DataType::Timestamp(_, _) => DFColumnType::Timestamp,
+        .map(|data_type| match data_type.logical() {
+            LogicalType::Boolean => DFColumnType::Boolean,
+            LogicalType::Int8
+            | LogicalType::Int16
+            | LogicalType::Int32
+            | LogicalType::Int64
+            | LogicalType::UInt8
+            | LogicalType::UInt16
+            | LogicalType::UInt32
+            | LogicalType::UInt64 => DFColumnType::Integer,
+            LogicalType::Float16
+            | LogicalType::Float32
+            | LogicalType::Float64
+            | LogicalType::Decimal128(_, _)
+            | LogicalType::Decimal256(_, _) => DFColumnType::Float,
+            LogicalType::Utf8 => DFColumnType::Text,
+            LogicalType::Date | LogicalType::Time32(_) | LogicalType::Time64(_) => {
+                DFColumnType::DateTime
+            }
+            LogicalType::Timestamp(_, _) => DFColumnType::Timestamp,
             _ => DFColumnType::Another,
         })
         .collect()
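Note how the rewritten match needs fewer arms than its physical predecessor: the removed `LargeUtf8`, `Date64`, and view-type arms suggest that several physical layouts fold into one logical signature. A small sketch of that folding, under the assumption (implied by the deleted arms, not confirmed elsewhere in this diff) that all string-like physical types report `LogicalType::Utf8`:

    use arrow_schema::DataType;
    use datafusion_common::logical_type::signature::LogicalType;
    use datafusion_common::logical_type::{LogicalPhysicalType, TypeRelation};

    // One logical check covers every string-like physical layout.
    fn is_text(t: &LogicalPhysicalType) -> bool {
        matches!(t.logical(), LogicalType::Utf8)
    }

    fn main() {
        assert!(is_text(&DataType::Utf8.into()));
        assert!(is_text(&DataType::LargeUtf8.into()));
    }
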
diff --git a/datafusion/sqllogictest/src/test_context.rs b/datafusion/sqllogictest/src/test_context.rs
index dd27727e3ad5..1b5f19b29885 100644
--- a/datafusion/sqllogictest/src/test_context.rs
+++ b/datafusion/sqllogictest/src/test_context.rs
@@ -25,7 +25,7 @@ use arrow::array::{
     ArrayRef, BinaryArray, Float64Array, Int32Array, LargeBinaryArray, LargeStringArray,
     StringArray, TimestampNanosecondArray,
 };
-use arrow::datatypes::{DataType, Field, Schema, SchemaRef, TimeUnit};
+use arrow::datatypes::{DataType, Field, Schema, TimeUnit};
 use arrow::record_batch::RecordBatch;
 use datafusion::execution::context::SessionState;
 use datafusion::logical_expr::{create_udf, ColumnarValue, Expr, ScalarUDF, Volatility};
@@ -40,6 +40,7 @@ use datafusion_common::cast::as_float64_array;
 use datafusion_common::DataFusionError;

 use async_trait::async_trait;
+use datafusion_common::logical_type::schema::LogicalPhysicalSchemaRef;
 use log::info;
 use tempfile::TempDir;

@@ -215,7 +216,7 @@ pub async fn register_temp_table(ctx: &SessionContext) {
             self.0
         }

-        fn schema(&self) -> SchemaRef {
+        fn schema(&self) -> LogicalPhysicalSchemaRef {
             unimplemented!()
         }
diff --git a/datafusion/substrait/src/logical_plan/consumer.rs b/datafusion/substrait/src/logical_plan/consumer.rs
index 89a6dde51e42..0113166bb62e 100644
--- a/datafusion/substrait/src/logical_plan/consumer.rs
+++ b/datafusion/substrait/src/logical_plan/consumer.rs
@@ -21,6 +21,9 @@ use datafusion::arrow::array::GenericListArray;
 use datafusion::arrow::datatypes::{
     DataType, Field, FieldRef, Fields, IntervalUnit, Schema, TimeUnit,
 };
+use datafusion::common::logical_type::field::LogicalPhysicalField;
+use datafusion::common::logical_type::schema::LogicalPhysicalSchema;
+use datafusion::common::logical_type::{TypeRelation, LogicalPhysicalType};
 use datafusion::common::plan_err;
 use datafusion::common::{
     not_impl_datafusion_err, not_impl_err, plan_datafusion_err, substrait_datafusion_err,
@@ -358,20 +361,20 @@ fn make_renamed_schema(
     let mut name_idx = 0;

-    let (qualifiers, fields): (_, Vec<Field>) = schema
+    let (qualifiers, fields): (_, Vec<LogicalPhysicalField>) = schema
         .iter()
         .map(|(q, f)| {
             let name = next_struct_field_name(0, dfs_names, &mut name_idx)?;
             Ok((
                 q.cloned(),
-                (**f)
-                    .to_owned()
-                    .with_name(name)
-                    .with_data_type(rename_inner_fields(
-                        f.data_type(),
+                (**f).to_owned().with_name(name).with_data_type(
+                    rename_inner_fields(
+                        f.data_type().physical(),
                         dfs_names,
                         &mut name_idx,
-                    )?),
+                    )?
+                    .into(),
+                ),
             ))
         })
         .collect::<Result<Vec<_>>>()?
@@ -387,7 +390,7 @@ fn make_renamed_schema(

     DFSchema::from_field_specific_qualified_schema(
         qualifiers,
-        &Arc::new(Schema::new(fields)),
+        &Arc::new(LogicalPhysicalSchema::new(fields)),
     )
 }

@@ -846,7 +849,7 @@ pub async fn from_substrait_rel(
 }

 /// (Re)qualify the sides of a join if needed, i.e. if the columns from one side would otherwise
-/// conflict with the columns from the other. 
+/// conflict with the columns from the other.
 /// Substrait doesn't currently allow specifying aliases, neither for columns nor for tables. For
 /// Substrait the names don't matter since it only refers to columns by indices, however DataFusion
 /// requires columns to be uniquely identifiable, in some places (see e.g. DFSchema::check_names).
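`make_renamed_schema` above rebuilds every field with a fresh name and a converted data type, then assembles a `LogicalPhysicalSchema` from the results. The constructor shapes mirror Arrow's `Field`/`Schema` API, as the other hunks in this diff use them. A self-contained sketch (field names and the `example_schema` helper are illustrative only):

    use arrow_schema::DataType;
    use datafusion_common::logical_type::field::LogicalPhysicalField;
    use datafusion_common::logical_type::schema::LogicalPhysicalSchema;

    // Build a two-column logical schema the way `build_schema` and
    // `make_renamed_schema` do: name, logical type, nullability.
    fn example_schema() -> LogicalPhysicalSchema {
        let fields = vec![
            LogicalPhysicalField::new("id", DataType::Int32.into(), false),
            LogicalPhysicalField::new("name", DataType::Utf8.into(), true),
        ];
        LogicalPhysicalSchema::new(fields)
    }
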
@@ -1344,7 +1347,7 @@ pub async fn from_substrait_rex(
         }
     }
 }

-pub(crate) fn from_substrait_type_without_names(dt: &Type) -> Result<DataType> {
+pub(crate) fn from_substrait_type_without_names(dt: &Type) -> Result<LogicalPhysicalType> {
     from_substrait_type(dt, &[], &mut 0)
 }

@@ -1352,77 +1355,77 @@ fn from_substrait_type(
     dt: &Type,
     dfs_names: &[String],
     name_idx: &mut usize,
-) -> Result<DataType> {
+) -> Result<LogicalPhysicalType> {
     match &dt.kind {
         Some(s_kind) => match s_kind {
-            r#type::Kind::Bool(_) => Ok(DataType::Boolean),
+            r#type::Kind::Bool(_) => Ok(DataType::Boolean.into()),
             r#type::Kind::I8(integer) => match integer.type_variation_reference {
-                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int8),
-                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt8),
+                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int8.into()),
+                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt8.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::I16(integer) => match integer.type_variation_reference {
-                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int16),
-                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt16),
+                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int16.into()),
+                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt16.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::I32(integer) => match integer.type_variation_reference {
-                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int32),
-                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt32),
+                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int32.into()),
+                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt32.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::I64(integer) => match integer.type_variation_reference {
-                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int64),
-                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt64),
+                DEFAULT_TYPE_VARIATION_REF => Ok(DataType::Int64.into()),
+                UNSIGNED_INTEGER_TYPE_VARIATION_REF => Ok(DataType::UInt64.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
-            r#type::Kind::Fp32(_) => Ok(DataType::Float32),
-            r#type::Kind::Fp64(_) => Ok(DataType::Float64),
+            r#type::Kind::Fp32(_) => Ok(DataType::Float32.into()),
+            r#type::Kind::Fp64(_) => Ok(DataType::Float64.into()),
             r#type::Kind::Timestamp(ts) => match ts.type_variation_reference {
                 TIMESTAMP_SECOND_TYPE_VARIATION_REF => {
-                    Ok(DataType::Timestamp(TimeUnit::Second, None))
+                    Ok(DataType::Timestamp(TimeUnit::Second, None).into())
                 }
                 TIMESTAMP_MILLI_TYPE_VARIATION_REF => {
-                    Ok(DataType::Timestamp(TimeUnit::Millisecond, None))
+                    Ok(DataType::Timestamp(TimeUnit::Millisecond, None).into())
                 }
                 TIMESTAMP_MICRO_TYPE_VARIATION_REF => {
-                    Ok(DataType::Timestamp(TimeUnit::Microsecond, None))
+                    Ok(DataType::Timestamp(TimeUnit::Microsecond, None).into())
                 }
                 TIMESTAMP_NANO_TYPE_VARIATION_REF => {
-                    Ok(DataType::Timestamp(TimeUnit::Nanosecond, None))
+                    Ok(DataType::Timestamp(TimeUnit::Nanosecond, None).into())
                 }
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::Date(date) => match date.type_variation_reference {
-                DATE_32_TYPE_VARIATION_REF => Ok(DataType::Date32),
-                DATE_64_TYPE_VARIATION_REF => Ok(DataType::Date64),
+                DATE_32_TYPE_VARIATION_REF => Ok(DataType::Date32.into()),
+                DATE_64_TYPE_VARIATION_REF => Ok(DataType::Date64.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::Binary(binary) => match binary.type_variation_reference {
-                DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Binary),
-                LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeBinary),
+                DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Binary.into()),
+                LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeBinary.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::FixedBinary(fixed) => {
-                Ok(DataType::FixedSizeBinary(fixed.length))
+                Ok(DataType::FixedSizeBinary(fixed.length).into())
             }
             r#type::Kind::String(string) => match string.type_variation_reference {
-                DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8),
-                LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeUtf8),
+                DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::Utf8.into()),
+                LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeUtf8.into()),
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
@@ -1432,14 +1435,14 @@ fn from_substrait_type(
                     substrait_datafusion_err!("List type must have inner type")
                 })?;
                 let field = Arc::new(Field::new_list_field(
-                    from_substrait_type(inner_type, dfs_names, name_idx)?,
-                    // We ignore Substrait's nullability here to match to_substrait_literal
+                    from_substrait_type(inner_type, dfs_names, name_idx)?.physical().clone(),
+                    // We ignore Substrait's nullability here to match to_substrait_literal
                     // which always creates nullable lists
                     true,
                 ));
                 match list.type_variation_reference {
-                    DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::List(field)),
-                    LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeList(field)),
+                    DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::List(field).into()),
+                    LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(DataType::LargeList(field).into()),
                     v => not_impl_err!(
                         "Unsupported Substrait type variation {v} of type {s_kind:?}"
                     )?,
@@ -1454,12 +1457,12 @@ fn from_substrait_type(
                 })?;
                 let key_field = Arc::new(Field::new(
                     "key",
-                    from_substrait_type(key_type, dfs_names, name_idx)?,
+                    from_substrait_type(key_type, dfs_names, name_idx)?.physical().clone(),
                     false,
                 ));
                 let value_field = Arc::new(Field::new(
                     "value",
-                    from_substrait_type(value_type, dfs_names, name_idx)?,
+                    from_substrait_type(value_type, dfs_names, name_idx)?.physical().clone(),
                     true,
                 ));
                 match map.type_variation_reference {
@@ -1468,7 +1471,7 @@ fn from_substrait_type(
                             "entries",
                             [key_field, value_field],
                             false, // The inner map field is always non-nullable (Arrow #1697),
-                        )), false))
+                        )), false).into())
                     },
                     v => not_impl_err!(
                         "Unsupported Substrait type variation {v} of type {s_kind:?}"
                     ),
             }
             r#type::Kind::Decimal(d) => match d.type_variation_reference {
                 DECIMAL_128_TYPE_VARIATION_REF => {
-                    Ok(DataType::Decimal128(d.precision as u8, d.scale as i8))
+                    Ok(DataType::Decimal128(d.precision as u8, d.scale as i8).into())
                 }
                 DECIMAL_256_TYPE_VARIATION_REF => {
-                    Ok(DataType::Decimal256(d.precision as u8, d.scale as i8))
+                    Ok(DataType::Decimal256(d.precision as u8, d.scale as i8).into())
                 }
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {s_kind:?}"
                 ),
             },
             r#type::Kind::UserDefined(u) => {
                 match u.type_reference {
                     INTERVAL_YEAR_MONTH_TYPE_REF => {
-                        Ok(DataType::Interval(IntervalUnit::YearMonth))
+                        Ok(DataType::Interval(IntervalUnit::YearMonth).into())
                     }
                     INTERVAL_DAY_TIME_TYPE_REF => {
-                        Ok(DataType::Interval(IntervalUnit::DayTime))
+                        Ok(DataType::Interval(IntervalUnit::DayTime).into())
                     }
                    INTERVAL_MONTH_DAY_NANO_TYPE_REF => {
-                        Ok(DataType::Interval(IntervalUnit::MonthDayNano))
+                        Ok(DataType::Interval(IntervalUnit::MonthDayNano).into())
                    }
                    _ => not_impl_err!(
                        "Unsupported Substrait user defined type with ref {} and variation {}",
@@ -1506,9 +1509,9 @@ fn from_substrait_type(
             },
             r#type::Kind::Struct(s) => Ok(DataType::Struct(from_substrait_struct_type(
                 s, dfs_names, name_idx,
-            )?)),
-            r#type::Kind::Varchar(_) => Ok(DataType::Utf8),
-            r#type::Kind::FixedChar(_) => Ok(DataType::Utf8),
+            )?).into()),
+            r#type::Kind::Varchar(_) => Ok(DataType::Utf8.into()),
+            r#type::Kind::FixedChar(_) => Ok(DataType::Utf8.into()),
             _ => not_impl_err!("Unsupported Substrait type: {s_kind:?}"),
         },
         _ => not_impl_err!("`None` Substrait kind is not supported"),
@@ -1524,7 +1527,9 @@ fn from_substrait_struct_type(
     for (i, f) in s.types.iter().enumerate() {
         let field = Field::new(
             next_struct_field_name(i, dfs_names, name_idx)?,
-            from_substrait_type(f, dfs_names, name_idx)?,
+            from_substrait_type(f, dfs_names, name_idx)?
+                .physical()
+                .clone(),
             true, // We assume everything to be nullable since that's easier than ensuring it matches
         );
         fields.push(field);
@@ -1742,11 +1747,11 @@ fn from_substrait_literal(
                 name_idx,
             )?;
             match lit.type_variation_reference {
-                DEFAULT_CONTAINER_TYPE_VARIATION_REF => {
-                    ScalarValue::List(ScalarValue::new_list_nullable(&[], &element_type))
-                }
+                DEFAULT_CONTAINER_TYPE_VARIATION_REF => ScalarValue::List(
+                    ScalarValue::new_list_nullable(&[], &element_type.physical()),
+                ),
                 LARGE_CONTAINER_TYPE_VARIATION_REF => ScalarValue::LargeList(
-                    ScalarValue::new_large_list(&[], &element_type),
+                    ScalarValue::new_large_list(&[], &element_type.physical()),
                 ),
                 others => {
                     return substrait_err!("Unknown type variation reference {others}");
@@ -1925,7 +1930,7 @@ fn from_substrait_null(
             d.scale as i8,
         )),
         r#type::Kind::List(l) => {
-            let field = Field::new_list_field(
+            let field = LogicalPhysicalField::new_list_field(
                 from_substrait_type(
                     l.r#type.clone().unwrap().as_ref(),
                     dfs_names,
@@ -1934,19 +1939,25 @@ fn from_substrait_null(
                 true,
             );
             match l.type_variation_reference {
-                DEFAULT_CONTAINER_TYPE_VARIATION_REF => Ok(ScalarValue::List(
-                    Arc::new(GenericListArray::new_null(field.into(), 1)),
-                )),
-                LARGE_CONTAINER_TYPE_VARIATION_REF => Ok(ScalarValue::LargeList(
-                    Arc::new(GenericListArray::new_null(field.into(), 1)),
-                )),
+                DEFAULT_CONTAINER_TYPE_VARIATION_REF => {
+                    Ok(ScalarValue::List(Arc::new(GenericListArray::new_null(
+                        FieldRef::new(field.into()),
+                        1,
+                    ))))
+                }
+                LARGE_CONTAINER_TYPE_VARIATION_REF => {
+                    Ok(ScalarValue::LargeList(Arc::new(
+                        GenericListArray::new_null(FieldRef::new(field.into()), 1),
+                    )))
+                }
                 v => not_impl_err!(
                     "Unsupported Substrait type variation {v} of type {kind:?}"
                 ),
             }
         }
         r#type::Kind::Struct(s) => {
-            let fields = from_substrait_struct_type(s, dfs_names, name_idx)?;
+            let fields: Fields =
+                from_substrait_struct_type(s, dfs_names, name_idx)?.into();
             Ok(ScalarStructBuilder::new_null(fields))
         }
         _ => not_impl_err!("Unsupported Substrait type for null: {kind:?}"),
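The consumer code above leans heavily on Substrait's type variation anchors: one Substrait kind fans out to several Arrow types depending on the variation reference. A compact sketch of that dispatch for the I32 kind, assuming the `variation_const` module is publicly importable from the crate root as the producer's `use crate::variation_const::...` imports suggest:

    use datafusion::arrow::datatypes::DataType;
    use datafusion_substrait::variation_const::{
        DEFAULT_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
    };

    // Same Substrait kind (I32), different Arrow type per variation anchor,
    // mirroring the match arms in `from_substrait_type`.
    fn i32_variation_to_arrow(variation: u32) -> Option<DataType> {
        match variation {
            DEFAULT_TYPE_VARIATION_REF => Some(DataType::Int32),
            UNSIGNED_INTEGER_TYPE_VARIATION_REF => Some(DataType::UInt32),
            _ => None,
        }
    }
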
diff --git a/datafusion/substrait/src/logical_plan/producer.rs b/datafusion/substrait/src/logical_plan/producer.rs
index 8d039a050249..4a9fa680f23f 100644
--- a/datafusion/substrait/src/logical_plan/producer.rs
+++ b/datafusion/substrait/src/logical_plan/producer.rs
@@ -21,19 +21,31 @@ use std::ops::Deref;
 use std::sync::Arc;

 use arrow_buffer::ToByteSlice;
-use datafusion::arrow::datatypes::IntervalUnit;
+use datafusion::arrow::datatypes::{DataType, IntervalUnit};
 use datafusion::logical_expr::{
     CrossJoin, Distinct, Like, Partitioning, WindowFrameUnits,
 };
 use datafusion::{
-    arrow::datatypes::{DataType, TimeUnit},
+    arrow::datatypes::TimeUnit,
     error::{DataFusionError, Result},
     logical_expr::{WindowFrame, WindowFrameBound},
     prelude::{JoinType, SessionContext},
     scalar::ScalarValue,
 };

+use crate::variation_const::{
+    DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF,
+    DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
+    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
+    INTERVAL_DAY_TIME_TYPE_REF, INTERVAL_DAY_TIME_TYPE_URL,
+    INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_MONTH_DAY_NANO_TYPE_URL,
+    INTERVAL_YEAR_MONTH_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_URL,
+    LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
+    TIMESTAMP_MILLI_TYPE_VARIATION_REF, TIMESTAMP_NANO_TYPE_VARIATION_REF,
+    TIMESTAMP_SECOND_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
+};
 use datafusion::arrow::array::{Array, GenericListArray, OffsetSizeTrait};
+use datafusion::common::logical_type::TypeRelation;
 use datafusion::common::{
     exec_err, internal_err, not_impl_err, plan_err, substrait_datafusion_err,
 };
@@ -91,18 +103,6 @@ use substrait::{
     version,
 };

-use crate::variation_const::{
-    DATE_32_TYPE_VARIATION_REF, DATE_64_TYPE_VARIATION_REF,
-    DECIMAL_128_TYPE_VARIATION_REF, DECIMAL_256_TYPE_VARIATION_REF,
-    DEFAULT_CONTAINER_TYPE_VARIATION_REF, DEFAULT_TYPE_VARIATION_REF,
-    INTERVAL_DAY_TIME_TYPE_REF, INTERVAL_DAY_TIME_TYPE_URL,
-    INTERVAL_MONTH_DAY_NANO_TYPE_REF, INTERVAL_MONTH_DAY_NANO_TYPE_URL,
-    INTERVAL_YEAR_MONTH_TYPE_REF, INTERVAL_YEAR_MONTH_TYPE_URL,
-    LARGE_CONTAINER_TYPE_VARIATION_REF, TIMESTAMP_MICRO_TYPE_VARIATION_REF,
-    TIMESTAMP_MILLI_TYPE_VARIATION_REF, TIMESTAMP_NANO_TYPE_VARIATION_REF,
-    TIMESTAMP_SECOND_TYPE_VARIATION_REF, UNSIGNED_INTEGER_TYPE_VARIATION_REF,
-};
-
 /// Convert DataFusion LogicalPlan to Substrait Plan
 pub fn to_substrait_plan(plan: &LogicalPlan, ctx: &SessionContext) -> Result<Box<Plan>> {
     // Parse relation nodes
@@ -617,7 +617,7 @@ fn to_substrait_named_struct(schema: &DFSchemaRef) -> Result<NamedStruct> {
         .iter()
         .map(|f| {
             let mut names = vec![f.name().to_string()];
-            names.extend(names_dfs(f.data_type())?);
+            names.extend(names_dfs(f.data_type().physical())?);
             Ok(names)
         })
         .flatten_ok()
@@ -627,7 +627,7 @@ fn to_substrait_named_struct(schema: &DFSchemaRef) -> Result<NamedStruct> {
         types: schema
             .fields()
             .iter()
-            .map(|f| to_substrait_type(f.data_type(), f.is_nullable()))
+            .map(|f| to_substrait_type(f.data_type().physical(), f.is_nullable()))
             .collect::<Result<_>>()?,
         type_variation_reference: DEFAULT_TYPE_VARIATION_REF,
         nullability: r#type::Nullability::Unspecified as i32,
@@ -1224,7 +1224,7 @@ pub fn to_substrait_rex(
             Ok(Expression {
                 rex_type: Some(RexType::Cast(Box::new(
                     substrait::proto::expression::Cast {
-                        r#type: Some(to_substrait_type(data_type, true)?),
+                        r#type: Some(to_substrait_type(data_type.physical(), true)?),
                         input: Some(Box::new(to_substrait_rex(
                             ctx,
                             expr,
@@ -2095,7 +2095,7 @@ fn convert_array_to_literal_list<T: OffsetSizeTrait>(
        .collect::<Result<Vec<_>>>()?;

     if values.is_empty() {
-        let et = match to_substrait_type(array.data_type(), array.is_nullable())? {
+        let et = match to_substrait_type(&array.data_type(), array.is_nullable())? {
            substrait::proto::Type {
                kind: Some(r#type::Kind::List(lt)),
            } => lt.as_ref().to_owned(),
@@ -2222,7 +2222,7 @@ mod test {
     };
     use arrow_buffer::{IntervalDayTime, IntervalMonthDayNano};
     use datafusion::arrow::array::GenericListArray;
-    use datafusion::arrow::datatypes::Field;
+    use datafusion::arrow::datatypes::{DataType, Field};
     use datafusion::common::scalar::ScalarStructBuilder;

     use super::*;
@@ -2387,9 +2387,10 @@ mod test {
         // As DataFusion doesn't consider nullability as a property of the type, but field,
         // it doesn't matter if we set nullability to true or false here.
-        let substrait = to_substrait_type(&dt, true)?;
+        let lt = dt.into();
+        let substrait = to_substrait_type(&lt, true)?;
         let roundtrip_dt = from_substrait_type_without_names(&substrait)?;
-        assert_eq!(dt, roundtrip_dt);
+        assert_eq!(&lt, roundtrip_dt.physical());
         Ok(())
     }
 }
diff --git a/dev/changelog/40.0.0.md b/dev/changelog/40.0.0.md
new file mode 100644
index 000000000000..72143ae48b28
--- /dev/null
+++ b/dev/changelog/40.0.0.md
@@ -0,0 +1,371 @@
+
+# Apache DataFusion 40.0.0 Changelog
+
+This release consists of 263 commits from 64 contributors. See credits at the end of this changelog for more information.
+
+**Breaking changes:**
+
+- Convert `StringAgg` to UDAF [#10945](https://github.com/apache/datafusion/pull/10945) (lewiszlw)
+- Convert `bool_and` & `bool_or` to UDAF [#11009](https://github.com/apache/datafusion/pull/11009) (jcsherin)
+- Convert Average to UDAF #10942 [#10964](https://github.com/apache/datafusion/pull/10964) (dharanad)
+- fix: remove the Sized requirement on ExecutionPlan::name() [#11047](https://github.com/apache/datafusion/pull/11047) (waynexia)
+- Return `&Arc` reference to inner trait object [#11103](https://github.com/apache/datafusion/pull/11103) (linhr)
+- Support COPY TO Externally Defined File Formats, add FileType trait [#11060](https://github.com/apache/datafusion/pull/11060) (devinjdangelo)
+- expose table name in proto extension codec [#11139](https://github.com/apache/datafusion/pull/11139) (leoyvens)
+- fix(typo): unqualifed to unqualified [#11159](https://github.com/apache/datafusion/pull/11159) (waynexia)
+- Consolidate `Filter::remove_aliases` into `Expr::unalias_nested` [#11001](https://github.com/apache/datafusion/pull/11001) (alamb)
+- Convert `nth_value` to UDAF [#11287](https://github.com/apache/datafusion/pull/11287) (jcsherin)
+
+**Implemented enhancements:**
+
+- feat: Add support for Int8 and Int16 data types in data page statistics [#10931](https://github.com/apache/datafusion/pull/10931) (Weijun-H)
+- feat: add CliSessionContext trait for cli [#10890](https://github.com/apache/datafusion/pull/10890) (tshauck)
+- feat(optimizer): handle partial anchored regex cases and improve doc [#10977](https://github.com/apache/datafusion/pull/10977) (waynexia)
+- feat: support uint data page extraction [#11018](https://github.com/apache/datafusion/pull/11018) (tshauck)
+- feat: propagate EmptyRelation for more join types [#10963](https://github.com/apache/datafusion/pull/10963) (tshauck)
+- feat: Add method to add analyzer rules to SessionContext [#10849](https://github.com/apache/datafusion/pull/10849) (pingsutw)
+- feat: Support duplicate column names in Joins in Substrait consumer [#11049](https://github.com/apache/datafusion/pull/11049) (Blizzara)
+- feat: Add support for Timestamp data types in data page statistics. [#11123](https://github.com/apache/datafusion/pull/11123) (efredine)
[#11123](https://github.com/apache/datafusion/pull/11123) (efredine) +- feat: Add support for `Binary`/`LargeBinary`/`Utf8`/`LargeUtf8` data types in data page statistics [#11136](https://github.com/apache/datafusion/pull/11136) (PsiACE) +- feat: Support Map type in Substrait conversions [#11129](https://github.com/apache/datafusion/pull/11129) (Blizzara) +- feat: Conditionally allow to keep partition_by columns when using PARTITIONED BY enhancement [#11107](https://github.com/apache/datafusion/pull/11107) (hveiga) +- feat: enable "substring" as a UDF in addition to "substr" [#11277](https://github.com/apache/datafusion/pull/11277) (Blizzara) + +**Fixed bugs:** + +- fix: use total ordering in the min & max accumulator for floats [#10627](https://github.com/apache/datafusion/pull/10627) (westonpace) +- fix: Support double quotes in `date_part` [#10833](https://github.com/apache/datafusion/pull/10833) (Weijun-H) +- fix: Ignore nullability of list elements when consuming Substrait [#10874](https://github.com/apache/datafusion/pull/10874) (Blizzara) +- fix: Support `NOT IN ()` via anti join [#10936](https://github.com/apache/datafusion/pull/10936) (akoshchiy) +- fix: CTEs defined in a subquery can escape their scope [#10954](https://github.com/apache/datafusion/pull/10954) (jonahgao) +- fix: Fix the incorrect null joined rows for SMJ outer join with join filter [#10892](https://github.com/apache/datafusion/pull/10892) (viirya) +- fix: gcd returns negative results [#11099](https://github.com/apache/datafusion/pull/11099) (jonahgao) +- fix: LCM panicked due to overflow [#11131](https://github.com/apache/datafusion/pull/11131) (jonahgao) +- fix: Support dictionary type in parquet metadata statistics. [#11169](https://github.com/apache/datafusion/pull/11169) (efredine) +- fix: Ignore nullability in Substrait structs [#11130](https://github.com/apache/datafusion/pull/11130) (Blizzara) +- fix: typo in comment about FinalPhysicalPlan [#11181](https://github.com/apache/datafusion/pull/11181) (c8ef) +- fix: Support Substrait's compound names also for window functions [#11163](https://github.com/apache/datafusion/pull/11163) (Blizzara) +- fix: Incorrect LEFT JOIN evaluation result on OR conditions [#11203](https://github.com/apache/datafusion/pull/11203) (viirya) +- fix: Be more lenient in interpreting input args for builtin window functions [#11199](https://github.com/apache/datafusion/pull/11199) (Blizzara) +- fix: correctly handle Substrait windows with rows bounds (and validate executability of test plans) [#11278](https://github.com/apache/datafusion/pull/11278) (Blizzara) +- fix: When consuming Substrait, temporarily rename clashing duplicate columns [#11329](https://github.com/apache/datafusion/pull/11329) (Blizzara) + +**Documentation updates:** + +- Minor: Clarify `SessionContext::state` docs [#10847](https://github.com/apache/datafusion/pull/10847) (alamb) +- Minor: Update SIGMOD paper reference url [#10860](https://github.com/apache/datafusion/pull/10860) (alamb) +- docs(variance): Correct typos in comments [#10844](https://github.com/apache/datafusion/pull/10844) (pingsutw) +- Add missing code close tick in LiteralGuarantee docs [#10859](https://github.com/apache/datafusion/pull/10859) (adriangb) +- Minor: Add more docs and examples for `Transformed` and `TransformedResult` [#11003](https://github.com/apache/datafusion/pull/11003) (alamb) +- doc: Update links in the documantation [#11044](https://github.com/apache/datafusion/pull/11044) (Weijun-H) +- Minor: Examples cleanup + more docs in 
pruning example [#11086](https://github.com/apache/datafusion/pull/11086) (alamb) +- Minor: refine documentation pointing to examples [#11110](https://github.com/apache/datafusion/pull/11110) (alamb) +- Fix running in Docker instructions [#11141](https://github.com/apache/datafusion/pull/11141) (findepi) +- docs: add example for custom file format with `COPY TO` [#11174](https://github.com/apache/datafusion/pull/11174) (tshauck) +- Fix docs wordings [#11226](https://github.com/apache/datafusion/pull/11226) (findepi) +- Fix count() docs around including null values [#11293](https://github.com/apache/datafusion/pull/11293) (findepi) + +**Other:** + +- chore: Prepare 39.0.0-rc1 [#10828](https://github.com/apache/datafusion/pull/10828) (andygrove) +- Remove expr_fn::sum and replace them with function stub [#10816](https://github.com/apache/datafusion/pull/10816) (jayzhan211) +- Debug print as many fields as possible for `SessionState` [#10818](https://github.com/apache/datafusion/pull/10818) (lewiszlw) +- Prune Parquet RowGroup in a single call to `PruningPredicate::prune`, update StatisticsExtractor API [#10802](https://github.com/apache/datafusion/pull/10802) (alamb) +- Remove Built-in sum and Rename to lowercase `sum` [#10831](https://github.com/apache/datafusion/pull/10831) (jayzhan211) +- Convert `stddev` and `stddev_pop` to UDAF [#10834](https://github.com/apache/datafusion/pull/10834) (goldmedal) +- Introduce expr builder for aggregate function [#10560](https://github.com/apache/datafusion/pull/10560) (jayzhan211) +- chore: Improve change log generator [#10841](https://github.com/apache/datafusion/pull/10841) (andygrove) +- Support user defined `ParquetAccessPlan` in `ParquetExec`, validation to `ParquetAccessPlan::select` [#10813](https://github.com/apache/datafusion/pull/10813) (alamb) +- Convert `VariancePopulation` to UDAF [#10836](https://github.com/apache/datafusion/pull/10836) (mknaw) +- Convert `approx_median` to UDAF [#10840](https://github.com/apache/datafusion/pull/10840) (goldmedal) +- MINOR: use workspace deps in proto-common (upgrade object store dependency) [#10848](https://github.com/apache/datafusion/pull/10848) (waynexia) +- Minor: add `Window::try_new_with_schema` constructor [#10850](https://github.com/apache/datafusion/pull/10850) (sadboy) +- Add support for reading CSV files with comments [#10467](https://github.com/apache/datafusion/pull/10467) (bbannier) +- Convert approx_distinct to UDAF [#10851](https://github.com/apache/datafusion/pull/10851) (Lordworms) +- minor: add proto-common crate to release instructions [#10858](https://github.com/apache/datafusion/pull/10858) (andygrove) +- Implement TPCH substrait integration teset, support tpch_1 [#10842](https://github.com/apache/datafusion/pull/10842) (Lordworms) +- Remove unecessary passing around of `suffix: &str` in `pruning.rs`'s `RequiredColumns` [#10863](https://github.com/apache/datafusion/pull/10863) (adriangb) +- chore: Make DFSchema::datatype_is_logically_equal function public [#10867](https://github.com/apache/datafusion/pull/10867) (advancedxy) +- Bump braces from 3.0.2 to 3.0.3 in /datafusion/wasmtest/datafusion-wasm-app [#10865](https://github.com/apache/datafusion/pull/10865) (dependabot[bot]) +- Docs: Add `unnest` to SQL Reference [#10839](https://github.com/apache/datafusion/pull/10839) (gloomweaver) +- Support correct output column names and struct field names when consuming/producing Substrait [#10829](https://github.com/apache/datafusion/pull/10829) (Blizzara) +- Make Logical Plans more readable 
by removing extra aliases [#10832](https://github.com/apache/datafusion/pull/10832) (MohamedAbdeen21) +- Minor: Improve `ListingTable` documentation [#10854](https://github.com/apache/datafusion/pull/10854) (alamb) +- Extending join fuzz tests to support join filtering [#10728](https://github.com/apache/datafusion/pull/10728) (edmondop) +- replace and(_, not(_)) with and_not(\*) [#10885](https://github.com/apache/datafusion/pull/10885) (RTEnzyme) +- Disabling test for semi join with filters [#10887](https://github.com/apache/datafusion/pull/10887) (edmondop) +- Minor: Update `min_statistics` and `max_statistics` to be helpers, update docs [#10866](https://github.com/apache/datafusion/pull/10866) (alamb) +- Remove `Interval` column test // parquet extraction [#10888](https://github.com/apache/datafusion/pull/10888) (marvinlanhenke) +- Minor: SMJ fuzz tests fix for rowcounts [#10891](https://github.com/apache/datafusion/pull/10891) (comphead) +- Move `Count` to `functions-aggregate`, update MSRV to rust 1.75 [#10484](https://github.com/apache/datafusion/pull/10484) (jayzhan211) +- refactor: fetch statistics for a given ParquetMetaData [#10880](https://github.com/apache/datafusion/pull/10880) (NGA-TRAN) +- Move FileSinkExec::metrics to the correct place [#10901](https://github.com/apache/datafusion/pull/10901) (joroKr21) +- Refine ParquetAccessPlan comments and tests [#10896](https://github.com/apache/datafusion/pull/10896) (alamb) +- ci: fix clippy failures on main [#10903](https://github.com/apache/datafusion/pull/10903) (jonahgao) +- Minor: disable flaky fuzz test [#10904](https://github.com/apache/datafusion/pull/10904) (comphead) +- Remove builtin count [#10893](https://github.com/apache/datafusion/pull/10893) (jayzhan211) +- Move Regr\_\* functions to use UDAF [#10898](https://github.com/apache/datafusion/pull/10898) (eejbyfeldt) +- Docs: clarify when the parquet reader will read from object store when using cached metadata [#10909](https://github.com/apache/datafusion/pull/10909) (alamb) +- Minor: Fix `bench.sh tpch data` [#10905](https://github.com/apache/datafusion/pull/10905) (alamb) +- Minor: use venv in benchmark compare [#10894](https://github.com/apache/datafusion/pull/10894) (tmi) +- Support explicit type and name during table creation [#10273](https://github.com/apache/datafusion/pull/10273) (duongcongtoai) +- Simplify Join Partition Rules [#10911](https://github.com/apache/datafusion/pull/10911) (berkaysynnada) +- Move `Literal` to `physical-expr-common` [#10910](https://github.com/apache/datafusion/pull/10910) (lewiszlw) +- chore: update some error messages for clarity [#10916](https://github.com/apache/datafusion/pull/10916) (jeffreyssmith2nd) +- Initial Extract parquet data page statistics API [#10852](https://github.com/apache/datafusion/pull/10852) (marvinlanhenke) +- Add contains function, and support in datafusion substrait consumer [#10879](https://github.com/apache/datafusion/pull/10879) (Lordworms) +- Minor: Improve `arrow_statistics` tests [#10927](https://github.com/apache/datafusion/pull/10927) (alamb) +- Minor: Remove `prefer_hash_join` env variable for clickbench [#10933](https://github.com/apache/datafusion/pull/10933) (jayzhan211) +- Convert ApproxPercentileCont and ApproxPercentileContWithWeight to UDAF [#10917](https://github.com/apache/datafusion/pull/10917) (goldmedal) +- refactor: remove extra default in max rows [#10941](https://github.com/apache/datafusion/pull/10941) (tshauck) +- chore: Improve performance of Parquet statistics conversion 
[#10932](https://github.com/apache/datafusion/pull/10932) (Weijun-H) +- Add catalog::resolve_table_references [#10876](https://github.com/apache/datafusion/pull/10876) (leoyvens) +- Convert BitAnd, BitOr, BitXor to UDAF [#10930](https://github.com/apache/datafusion/pull/10930) (dharanad) +- refactor: improve PoolType argument handling for CLI [#10940](https://github.com/apache/datafusion/pull/10940) (tshauck) +- Minor: remove potential string copy from Column::from_qualified_name [#10947](https://github.com/apache/datafusion/pull/10947) (alamb) +- Fix: StatisticsConverter `counts` for missing columns [#10946](https://github.com/apache/datafusion/pull/10946) (marvinlanhenke) +- Add initial support for Utf8View and BinaryView types [#10925](https://github.com/apache/datafusion/pull/10925) (XiangpengHao) +- Use shorter aliases in CSE [#10939](https://github.com/apache/datafusion/pull/10939) (peter-toth) +- Substrait support for ParquetExec round trip for simple select [#10949](https://github.com/apache/datafusion/pull/10949) (xinlifoobar) +- Support to unparse `ScalarValue::IntervalMonthDayNano` to String [#10956](https://github.com/apache/datafusion/pull/10956) (goldmedal) +- Minor: Return option from row_group_row_count [#10973](https://github.com/apache/datafusion/pull/10973) (marvinlanhenke) +- Minor: Add routine to debug join fuzz tests [#10970](https://github.com/apache/datafusion/pull/10970) (comphead) +- Support to unparse `ScalarValue::TimestampNanosecond` to String [#10984](https://github.com/apache/datafusion/pull/10984) (goldmedal) +- build(deps-dev): bump ws from 8.14.2 to 8.17.1 in /datafusion/wasmtest/datafusion-wasm-app [#10988](https://github.com/apache/datafusion/pull/10988) (dependabot[bot]) +- Minor: reuse Rows buffer in GroupValuesRows [#10980](https://github.com/apache/datafusion/pull/10980) (alamb) +- Add example for writing SQL analysis using DataFusion structures [#10938](https://github.com/apache/datafusion/pull/10938) (LorrensP-2158466) +- Push down filter for Unnest plan [#10974](https://github.com/apache/datafusion/pull/10974) (jayzhan211) +- Add parquet page stats for float{16, 32, 64} [#10982](https://github.com/apache/datafusion/pull/10982) (tmi) +- Fix `file_stream_provider` example compilation failure on windows [#10975](https://github.com/apache/datafusion/pull/10975) (lewiszlw) +- Stop copying LogicalPlan and Exprs in `CommonSubexprEliminate` (2-3% planning speed improvement) [#10835](https://github.com/apache/datafusion/pull/10835) (alamb) +- chore: Update documentation link in `PhysicalOptimizerRule` comment [#11002](https://github.com/apache/datafusion/pull/11002) (Weijun-H) +- Push down filter plan for unnest on non-unnest column only [#10991](https://github.com/apache/datafusion/pull/10991) (jayzhan211) +- Minor: add test for pushdown past unnest [#11017](https://github.com/apache/datafusion/pull/11017) (alamb) +- Update docs for `protoc` minimum installed version [#11006](https://github.com/apache/datafusion/pull/11006) (jcsherin) +- propagate error instead of panicking on out of bounds in physical-expr/src/analysis.rs [#10992](https://github.com/apache/datafusion/pull/10992) (LorrensP-2158466) +- Add drop_columns to dataframe api [#11010](https://github.com/apache/datafusion/pull/11010) (Omega359) +- Push down filter plan for non-unnest column [#11019](https://github.com/apache/datafusion/pull/11019) (jayzhan211) +- Consider timezones with `UTC` and `+00:00` to be the same [#10960](https://github.com/apache/datafusion/pull/10960) (marvinlanhenke) +- 
Deprecate `OptimizerRule::try_optimize` [#11022](https://github.com/apache/datafusion/pull/11022) (lewiszlw) +- Relax combine partial final rule [#10913](https://github.com/apache/datafusion/pull/10913) (mustafasrepo) +- Compute gcd with u64 instead of i64 because of overflows [#11036](https://github.com/apache/datafusion/pull/11036) (LorrensP-2158466) +- Add distinct_on to dataframe api [#11012](https://github.com/apache/datafusion/pull/11012) (Omega359) +- chore: add test to show current behavior of `AT TIME ZONE` for string vs. timestamp [#11056](https://github.com/apache/datafusion/pull/11056) (appletreeisyellow) +- Boolean parquet get datapage stat [#11054](https://github.com/apache/datafusion/pull/11054) (LorrensP-2158466) +- Using display_name for Expr::Aggregation [#11020](https://github.com/apache/datafusion/pull/11020) (Lordworms) +- Minor: Convert `Count`'s name to lowercase [#11028](https://github.com/apache/datafusion/pull/11028) (jayzhan211) +- Minor: Move `function::Hint` to `datafusion-expr` crate to avoid physical-expr dependency for `datafusion-function` crate [#11061](https://github.com/apache/datafusion/pull/11061) (jayzhan211) +- Support to unparse ScalarValue::TimestampMillisecond to String [#11046](https://github.com/apache/datafusion/pull/11046) (pingsutw) +- Support to unparse IntervalYearMonth and IntervalDayTime to String [#11065](https://github.com/apache/datafusion/pull/11065) (goldmedal) +- SMJ: fix streaming row concurrency issue for LEFT SEMI filtered join [#11041](https://github.com/apache/datafusion/pull/11041) (comphead) +- Add `advanced_parquet_index.rs` example of index in into parquet files [#10701](https://github.com/apache/datafusion/pull/10701) (alamb) +- Add Expr::column_refs to find column references without copying [#10948](https://github.com/apache/datafusion/pull/10948) (alamb) +- Give `OptimizerRule::try_optimize` default implementation and cleanup duplicated custom implementations [#11059](https://github.com/apache/datafusion/pull/11059) (lewiszlw) +- Fix `FormatOptions::CSV` propagation [#10912](https://github.com/apache/datafusion/pull/10912) (svranesevic) +- Support parsing SQL strings to Exprs [#10995](https://github.com/apache/datafusion/pull/10995) (xinlifoobar) +- Support dictionary data type in array_to_string [#10908](https://github.com/apache/datafusion/pull/10908) (EduardoVega) +- Implement min/max for interval types [#11015](https://github.com/apache/datafusion/pull/11015) (maxburke) +- Improve LIKE performance for Dictionary arrays [#11058](https://github.com/apache/datafusion/pull/11058) (Lordworms) +- handle overflow in gcd and return this as an error [#11057](https://github.com/apache/datafusion/pull/11057) (LorrensP-2158466) +- Convert Correlation to UDAF [#11064](https://github.com/apache/datafusion/pull/11064) (pingsutw) +- Migrate more code from `Expr::to_columns` to `Expr::column_refs` [#11067](https://github.com/apache/datafusion/pull/11067) (alamb) +- decimal support for unparser [#11092](https://github.com/apache/datafusion/pull/11092) (y-f-u) +- Improve `CommonSubexprEliminate` identifier management (10% faster planning) [#10473](https://github.com/apache/datafusion/pull/10473) (peter-toth) +- Change wildcard qualifier type from `String` to `TableReference` [#11073](https://github.com/apache/datafusion/pull/11073) (linhr) +- Allow access to UDTF in `SessionContext` [#11071](https://github.com/apache/datafusion/pull/11071) (linhr) +- Strip table qualifiers from schema in `UNION ALL` for unparser 
+- Update ListingTable to use StatisticsConverter [#11068](https://github.com/apache/datafusion/pull/11068) (xinlifoobar)
+- to_timestamp functions should preserve timezone [#11038](https://github.com/apache/datafusion/pull/11038) (maxburke)
+- Rewrite array operator to function in parser [#11101](https://github.com/apache/datafusion/pull/11101) (jayzhan211)
+- Resolve empty relation opt for join types [#11066](https://github.com/apache/datafusion/pull/11066) (LorrensP-2158466)
+- Add composed extension codec example [#11095](https://github.com/apache/datafusion/pull/11095) (lewiszlw)
+- Minor: Avoid some repetition in to_timestamp [#11116](https://github.com/apache/datafusion/pull/11116) (alamb)
+- Minor: fix ScalarValue::new_ten error message (cites one, not ten) [#11126](https://github.com/apache/datafusion/pull/11126) (gstvg)
+- Deprecate Expr::column_refs [#11115](https://github.com/apache/datafusion/pull/11115) (alamb)
+- Fix overflow in negate operator [#11084](https://github.com/apache/datafusion/pull/11084) (LorrensP-2158466)
+- Minor: Add Architectural Goals to the docs [#11109](https://github.com/apache/datafusion/pull/11109) (alamb)
+- Fix overflow in pow [#11124](https://github.com/apache/datafusion/pull/11124) (LorrensP-2158466)
+- Support to unparse Time scalar value to String [#11121](https://github.com/apache/datafusion/pull/11121) (goldmedal)
+- Support to unparse `TimestampSecond` and `TimestampMicrosecond` to String [#11120](https://github.com/apache/datafusion/pull/11120) (goldmedal)
+- Add standalone example for `OptimizerRule` [#11087](https://github.com/apache/datafusion/pull/11087) (alamb)
+- Fix overflow in factorial [#11134](https://github.com/apache/datafusion/pull/11134) (LorrensP-2158466)
+- Temporary Fix: Query error when grouping by case expressions [#11133](https://github.com/apache/datafusion/pull/11133) (jonahgao)
+- Fix nullability of return value of array_agg [#11093](https://github.com/apache/datafusion/pull/11093) (eejbyfeldt)
+- Support filter for List [#11091](https://github.com/apache/datafusion/pull/11091) (jayzhan211)
+- [MINOR]: Fix some minor silent bugs [#11127](https://github.com/apache/datafusion/pull/11127) (mustafasrepo)
+- Minor Fix for Logical and Physical Expr Conversions [#11142](https://github.com/apache/datafusion/pull/11142) (berkaysynnada)
+- Support Date Parquet Data Page Statistics [#11135](https://github.com/apache/datafusion/pull/11135) (dharanad)
+- Fix flaky array query slt test [#11140](https://github.com/apache/datafusion/pull/11140) (leoyvens)
+- Support Decimal and Decimal256 Parquet Data Page Statistics [#11138](https://github.com/apache/datafusion/pull/11138) (Lordworms)
+- Implement comparisons on nested data types such that distinct/except would work [#11117](https://github.com/apache/datafusion/pull/11117) (rtyler)
+- Minor: don't panic with bad arguments to round [#10899](https://github.com/apache/datafusion/pull/10899) (tmi)
+- Minor: reduce replication for nested comparison [#11149](https://github.com/apache/datafusion/pull/11149) (alamb)
+- [Minor]: Remove datafusion-functions-aggregate dependency from physical-expr crate [#11158](https://github.com/apache/datafusion/pull/11158) (mustafasrepo)
+- Add config to control Varchar behavior [#11090](https://github.com/apache/datafusion/pull/11090) (Lordworms)
+- minor: consolidate `gcd` related tests [#11164](https://github.com/apache/datafusion/pull/11164) (jonahgao)
+- Minor: move batch spilling methods to `lib.rs` to make them reusable [#11154](https://github.com/apache/datafusion/pull/11154) (comphead)
+- Move schema projection to where it's used in ListingTable [#11167](https://github.com/apache/datafusion/pull/11167) (adriangb)
+- Make running-in-Docker instructions copy-pastable [#11148](https://github.com/apache/datafusion/pull/11148) (findepi)
+- Rewrite `array @> array` and `array <@ array` in sql_expr_to_logical_expr [#11155](https://github.com/apache/datafusion/pull/11155) (jayzhan211)
+- Minor: make some physical_optimizer rules public [#11171](https://github.com/apache/datafusion/pull/11171) (askalt)
+- Remove pr_benchmarks.yml [#11165](https://github.com/apache/datafusion/pull/11165) (alamb)
+- Optionally display schema in explain plan [#11177](https://github.com/apache/datafusion/pull/11177) (alamb)
+- Minor: Add more support for ScalarValue::Float16 [#11156](https://github.com/apache/datafusion/pull/11156) (Lordworms)
+- Minor: fix SQLOptions::with_allow_ddl comments [#11166](https://github.com/apache/datafusion/pull/11166) (alamb)
+- Update sqllogictest requirement from 0.20.0 to 0.21.0 [#11189](https://github.com/apache/datafusion/pull/11189) (dependabot[bot])
+- Support Time Parquet Data Page Statistics [#11187](https://github.com/apache/datafusion/pull/11187) (dharanad)
+- Add support for Dictionary data type statistics from parquet data pages [#11195](https://github.com/apache/datafusion/pull/11195) (efredine)
+- [Minor]: Make sort_batch public [#11191](https://github.com/apache/datafusion/pull/11191) (mustafasrepo)
+- Introduce user defined SQL planner API [#11180](https://github.com/apache/datafusion/pull/11180) (jayzhan211)
+- Convert grouping to udaf [#11147](https://github.com/apache/datafusion/pull/11147) (Rachelint)
+- Make statistics_from_parquet_meta a sync function [#11205](https://github.com/apache/datafusion/pull/11205) (adriangb)
+- Allow user defined SQL planners to be registered [#11208](https://github.com/apache/datafusion/pull/11208) (samuelcolvin)
+- Recursive `unnest` [#11062](https://github.com/apache/datafusion/pull/11062) (duongcongtoai)
+- Document how to test examples in user guide, add some more coverage [#11178](https://github.com/apache/datafusion/pull/11178) (alamb)
+- Minor: Move MemoryCatalog\*Provider into a module, improve comments [#11183](https://github.com/apache/datafusion/pull/11183) (alamb)
+- Add standalone example of using the SQL frontend [#11088](https://github.com/apache/datafusion/pull/11088) (alamb)
+- Add Optimizer Sanity Checker, improve sortedness equivalence properties [#11196](https://github.com/apache/datafusion/pull/11196) (mustafasrepo)
+- Implement user defined planner for extract [#11215](https://github.com/apache/datafusion/pull/11215) (xinlifoobar)
+- Move basic SQL query examples to user guide [#11217](https://github.com/apache/datafusion/pull/11217) (alamb)
+- Support FixedSizedBinaryArray Parquet Data Page Statistics [#11200](https://github.com/apache/datafusion/pull/11200) (dharanad)
+- Implement ScalarValue::Map [#11224](https://github.com/apache/datafusion/pull/11224) (goldmedal)
+- Remove unmaintained python pre-commit configuration [#11255](https://github.com/apache/datafusion/pull/11255) (findepi)
+- Enable `clone_on_ref_ptr` clippy lint on execution crate [#11239](https://github.com/apache/datafusion/pull/11239) (lewiszlw)
+- Minor: Improve documentation about pushdown join predicates [#11209](https://github.com/apache/datafusion/pull/11209) (alamb)
+- Minor: clean up data page statistics tests and fix bugs [#11236](https://github.com/apache/datafusion/pull/11236) (efredine)
+- Replace pattern matching through downcast with trait method [#11257](https://github.com/apache/datafusion/pull/11257) (edmondop)
+- Update substrait requirement from 0.34.0 to 0.35.0 [#11206](https://github.com/apache/datafusion/pull/11206) (dependabot[bot])
+- Enhance short circuit handling in `CommonSubexprEliminate` [#11197](https://github.com/apache/datafusion/pull/11197) (peter-toth)
+- Add bench for data page statistics parquet extraction [#10950](https://github.com/apache/datafusion/pull/10950) (marvinlanhenke)
+- Register SQL planners in `SessionState` constructor [#11253](https://github.com/apache/datafusion/pull/11253) (dharanad)
+- Support DuckDB style struct syntax [#11214](https://github.com/apache/datafusion/pull/11214) (jayzhan211)
+- Enable `clone_on_ref_ptr` clippy lint on expr crate [#11238](https://github.com/apache/datafusion/pull/11238) (lewiszlw)
+- Optimize PushDownFilter to avoid recreating schema columns [#11211](https://github.com/apache/datafusion/pull/11211) (alamb)
+- Remove outdated `rewrite_expr.rs` example [#11085](https://github.com/apache/datafusion/pull/11085) (alamb)
+- Implement TPCH substrait integration test, support tpch_2 [#11234](https://github.com/apache/datafusion/pull/11234) (Lordworms)
+- Enable `clone_on_ref_ptr` clippy lint on physical-expr crate [#11240](https://github.com/apache/datafusion/pull/11240) (lewiszlw)
+- Add standalone `AnalyzerRule` example that implements row level access control [#11089](https://github.com/apache/datafusion/pull/11089) (alamb)
+- Replace println! with assert! if possible in DataFusion examples [#11237](https://github.com/apache/datafusion/pull/11237) (Nishi46)
+- minor: format `Expr::get_type()` [#11267](https://github.com/apache/datafusion/pull/11267) (jonahgao)
+- Fix hash join for nested types [#11232](https://github.com/apache/datafusion/pull/11232) (eejbyfeldt)
+- Infer count() aggregation is not null [#11256](https://github.com/apache/datafusion/pull/11256) (findepi)
+- Remove unnecessary qualified names [#11292](https://github.com/apache/datafusion/pull/11292) (findepi)
+- Fix running examples readme [#11225](https://github.com/apache/datafusion/pull/11225) (findepi)
+- Minor: Add `ConstExpr::from` and use in physical optimizer [#11283](https://github.com/apache/datafusion/pull/11283) (alamb)
+- Implement TPCH substrait integration test, support tpch_3 [#11298](https://github.com/apache/datafusion/pull/11298) (Lordworms)
+- Implement user defined planner for position [#11243](https://github.com/apache/datafusion/pull/11243) (xinlifoobar)
+- Upgrade to arrow 52.1.0 (and fix clippy issues on main) [#11302](https://github.com/apache/datafusion/pull/11302) (alamb)
+- AggregateExec: Take grouping sets into account for InputOrderMode [#11301](https://github.com/apache/datafusion/pull/11301) (thinkharderdev)
+- Add user_defined_sql_planners(..) to FunctionRegistry [#11296](https://github.com/apache/datafusion/pull/11296) (Omega359)
+- Use safe cast in propagate_constraints [#11297](https://github.com/apache/datafusion/pull/11297) (Lordworms)
+- Minor: Remove clone in optimizer [#11315](https://github.com/apache/datafusion/pull/11315) (jayzhan211)
+- minor: Add `PhysicalSortExpr::new` [#11310](https://github.com/apache/datafusion/pull/11310) (andygrove)
+- Fix data page statistics when all rows are null in a data page [#11295](https://github.com/apache/datafusion/pull/11295) (efredine)
+- Make UserDefinedFunctionPlanner usage uniform [#11318](https://github.com/apache/datafusion/pull/11318) (xinlifoobar)
+- Implement user defined planner for `create_struct` & `create_named_struct` [#11273](https://github.com/apache/datafusion/pull/11273) (dharanad)
+- Improve stats convert performance for Binary/String/Boolean arrays [#11319](https://github.com/apache/datafusion/pull/11319) (Rachelint)
+- Fix typos in datafusion-examples/datafusion-cli/docs [#11259](https://github.com/apache/datafusion/pull/11259) (lewiszlw)
+- Minor: Fix Failing TPC-DS Test [#11331](https://github.com/apache/datafusion/pull/11331) (berkaysynnada)
+- HashJoin can preserve the right ordering when join type is Right [#11276](https://github.com/apache/datafusion/pull/11276) (berkaysynnada)
+- Update substrait requirement from 0.35.0 to 0.36.0 [#11328](https://github.com/apache/datafusion/pull/11328) (dependabot[bot])
+- Support to unparse logical plans with timestamp cast to string [#11326](https://github.com/apache/datafusion/pull/11326) (sgrebnov)
+- Implement user defined planner for sql_substring_to_expr [#11327](https://github.com/apache/datafusion/pull/11327) (xinlifoobar)
+- Improve volatile expression handling in `CommonSubexprEliminate` [#11265](https://github.com/apache/datafusion/pull/11265) (peter-toth)
+- Support `IS NULL` and `IS NOT NULL` on Unions [#11321](https://github.com/apache/datafusion/pull/11321) (samuelcolvin)
+- Implement TPCH substrait integration test, support tpch_4 and tpch_5 [#11311](https://github.com/apache/datafusion/pull/11311) (Lordworms)
+- Enable `clone_on_ref_ptr` clippy lint on physical-plan crate [#11241](https://github.com/apache/datafusion/pull/11241) (lewiszlw)
+- Remove any aliases in `Filter::try_new` rather than erroring [#11307](https://github.com/apache/datafusion/pull/11307) (samuelcolvin)
+- Improve `DataFrame` Users Guide [#11324](https://github.com/apache/datafusion/pull/11324) (alamb)
+- chore: Rename UserDefinedSQLPlanner to ExprPlanner [#11338](https://github.com/apache/datafusion/pull/11338) (andygrove)
+- Revert "remove `derive(Copy)` from `Operator` (#11132)" [#11341](https://github.com/apache/datafusion/pull/11341) (alamb)
+
+## Credits
+
+Thank you to everyone who contributed to this release. Here is a breakdown of commits (PRs merged) per contributor.
+
+```
+    41 Andrew Lamb
+    17 Jay Zhan
+    12 Lordworms
+    12 张林伟
+    10 Arttu
+     9 Jax Liu
+     9 Lorrens Pantelis
+     8 Piotr Findeisen
+     7 Dharan Aditya
+     7 Jonah Gao
+     7 Xin Li
+     6 Andy Grove
+     6 Marvin Lanhenke
+     6 Trent Hauck
+     5 Alex Huang
+     5 Eric Fredine
+     5 Mustafa Akur
+     5 Oleks V
+     5 dependabot[bot]
+     4 Adrian Garcia Badaracco
+     4 Berkay Şahin
+     4 Kevin Su
+     4 Peter Toth
+     4 Ruihang Xia
+     4 Samuel Colvin
+     3 Bruce Ritchie
+     3 Edmondo Porcu
+     3 Emil Ejbyfeldt
+     3 Heran Lin
+     3 Leonardo Yvens
+     3 jcsherin
+     3 tmi
+     2 Duong Cong Toai
+     2 Liang-Chi Hsieh
+     2 Max Burke
+     2 kamille
+     1 Albert Skalt
+     1 Andrey Koshchiy
+     1 Benjamin Bannier
+     1 Bo Lin
+     1 Chojan Shang
+     1 Chunchun Ye
+     1 Dan Harris
+     1 Devin D'Angelo
+     1 Eduardo Vega
+     1 Georgi Krastev
+     1 Hector Veiga
+     1 Jeffrey Smith II
+     1 Kirill Khramkov
+     1 Matt Nawara
+     1 Mohamed Abdeen
+     1 Nga Tran
+     1 Nishi
+     1 Phillip LeBlanc
+     1 R. Tyler Croy
+     1 RT_Enzyme
+     1 Sava Vranešević
+     1 Sergei Grebnov
+     1 Weston Pace
+     1 Xiangpeng Hao
+     1 advancedxy
+     1 c8ef
+     1 gstvg
+     1 yfu
+```
+
+Thank you also to everyone who contributed in other ways such as filing issues, reviewing PRs, and providing feedback on this release.
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md
index 303caef57700..579088f991ef 100644
--- a/docs/source/user-guide/configs.md
+++ b/docs/source/user-guide/configs.md
@@ -64,7 +64,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus
 | datafusion.execution.parquet.statistics_enabled | NULL | Sets if statistics are enabled for any column Valid values are: "none", "chunk", and "page" These values are not case sensitive. If NULL, uses default parquet writer setting |
 | datafusion.execution.parquet.max_statistics_size | NULL | Sets max statistics size for any column. If NULL, uses default parquet writer setting |
 | datafusion.execution.parquet.max_row_group_size | 1048576 | Target maximum number of rows in each row group (defaults to 1M rows). Writing larger row groups requires more memory to write, but can get better compression and be faster to read. |
-| datafusion.execution.parquet.created_by | datafusion version 39.0.0 | Sets "created by" property |
+| datafusion.execution.parquet.created_by | datafusion version 40.0.0 | Sets "created by" property |
 | datafusion.execution.parquet.column_index_truncate_length | NULL | Sets column index truncate length |
 | datafusion.execution.parquet.data_page_row_count_limit | 18446744073709551615 | Sets best effort maximum number of rows in data page |
 | datafusion.execution.parquet.encoding | NULL | Sets default encoding for any column Valid values are: plain, plain_dictionary, rle, bit_packed, delta_binary_packed, delta_length_byte_array, delta_byte_array, rle_dictionary, and byte_stream_split. These values are not case sensitive. If NULL, uses default parquet writer setting |
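The `configs.md` hunk above documents that environment variables are read during `SessionConfig` initialisation, so they must be set before the config is constructed. Because this trips people up, here is a minimal sketch of the intended usage. It assumes the `SessionConfig::from_env` constructor and the `DATAFUSION_<SECTION>_<KEY>` variable naming convention; the specific variable and value shown are illustrative, not a recommendation.

```rust
use datafusion::error::Result;
use datafusion::prelude::{SessionConfig, SessionContext};

#[tokio::main]
async fn main() -> Result<()> {
    // The variable must already be exported before this line runs, e.g.
    //   DATAFUSION_EXECUTION_PARQUET_MAX_ROW_GROUP_SIZE=524288  (hypothetical value)
    // `from_env` reads the process environment once, at construction time.
    // `with_information_schema(true)` is needed for the `SHOW` statement below.
    let config = SessionConfig::from_env()?.with_information_schema(true);
    let ctx = SessionContext::new_with_config(config);

    // Changing the environment after this point has no effect on `ctx`;
    // verify what the session actually picked up:
    ctx.sql("SHOW datafusion.execution.parquet.max_row_group_size")
        .await?
        .show()
        .await?;
    Ok(())
}
```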