From db289b3daac1864e19b61deeab1e2070dd0c4967 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 18 Nov 2024 16:55:30 +0800 Subject: [PATCH 1/7] return int for integar date part Signed-off-by: jayzhan211 --- .../functions/src/datetime/date_part.rs | 79 +++++++---- .../sqllogictest/test_files/clickbench.slt | 2 +- datafusion/sqllogictest/test_files/expr.slt | 130 +++++++++--------- .../sqllogictest/test_files/group_by.slt | 4 +- .../sqllogictest/test_files/timestamps.slt | 8 +- 5 files changed, 126 insertions(+), 97 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 01e094bc4e0b..9e1e19ac5502 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -21,10 +21,9 @@ use std::sync::{Arc, OnceLock}; use arrow::array::{Array, ArrayRef, Float64Array}; use arrow::compute::kernels::cast_utils::IntervalUnit; -use arrow::compute::{binary, cast, date_part, DatePart}; +use arrow::compute::{binary, date_part, DatePart}; use arrow::datatypes::DataType::{ - Date32, Date64, Duration, Float64, Interval, Time32, Time64, Timestamp, Utf8, - Utf8View, + Date32, Date64, Duration, Interval, Time32, Time64, Timestamp, Utf8, Utf8View, }; use arrow::datatypes::IntervalUnit::{DayTime, MonthDayNano, YearMonth}; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; @@ -36,11 +35,12 @@ use datafusion_common::cast::{ as_timestamp_microsecond_array, as_timestamp_millisecond_array, as_timestamp_nanosecond_array, as_timestamp_second_array, }; -use datafusion_common::{exec_err, Result, ScalarValue}; +use datafusion_common::{exec_err, internal_err, ExprSchema, Result, ScalarValue}; use datafusion_expr::scalar_doc_sections::DOC_SECTION_DATETIME; use datafusion_expr::TypeSignature::Exact; use datafusion_expr::{ - ColumnarValue, Documentation, ScalarUDFImpl, Signature, Volatility, TIMEZONE_WILDCARD, + ColumnarValue, Documentation, Expr, ScalarUDFImpl, Signature, Volatility, + TIMEZONE_WILDCARD, }; #[derive(Debug)] @@ -148,7 +148,21 @@ impl ScalarUDFImpl for DatePartFunc { } fn return_type(&self, _arg_types: &[DataType]) -> Result { - Ok(Float64) + internal_err!("return_type_from_exprs shoud be called instead") + } + + fn return_type_from_exprs( + &self, + args: &[Expr], + _schema: &dyn ExprSchema, + _arg_types: &[DataType], + ) -> Result { + match &args[0] { + Expr::Literal(ScalarValue::Utf8(Some(part))) if is_integar_part(part) => { + Ok(DataType::Int32) + } + _ => Ok(DataType::Float64), + } } fn invoke(&self, args: &[ColumnarValue]) -> Result { @@ -174,22 +188,18 @@ impl ScalarUDFImpl for DatePartFunc { ColumnarValue::Scalar(scalar) => scalar.to_array()?, }; - // to remove quotes at most 2 characters - let part_trim = part.trim_matches(|c| c == '\'' || c == '\"'); - if ![2, 0].contains(&(part.len() - part_trim.len())) { - return exec_err!("Date part '{part}' not supported"); - } + let part_trim = part_normalization(part); // using IntervalUnit here means we hand off all the work of supporting plurals (like "seconds") // and synonyms ( like "ms,msec,msecond,millisecond") to Arrow let arr = if let Ok(interval_unit) = IntervalUnit::from_str(part_trim) { match interval_unit { - IntervalUnit::Year => date_part_f64(array.as_ref(), DatePart::Year)?, - IntervalUnit::Month => date_part_f64(array.as_ref(), DatePart::Month)?, - IntervalUnit::Week => date_part_f64(array.as_ref(), DatePart::Week)?, - IntervalUnit::Day => date_part_f64(array.as_ref(), DatePart::Day)?, - IntervalUnit::Hour => date_part_f64(array.as_ref(), DatePart::Hour)?, - IntervalUnit::Minute => date_part_f64(array.as_ref(), DatePart::Minute)?, + IntervalUnit::Year => date_part(array.as_ref(), DatePart::Year)?, + IntervalUnit::Month => date_part(array.as_ref(), DatePart::Month)?, + IntervalUnit::Week => date_part(array.as_ref(), DatePart::Week)?, + IntervalUnit::Day => date_part(array.as_ref(), DatePart::Day)?, + IntervalUnit::Hour => date_part(array.as_ref(), DatePart::Hour)?, + IntervalUnit::Minute => date_part(array.as_ref(), DatePart::Minute)?, IntervalUnit::Second => seconds(array.as_ref(), Second)?, IntervalUnit::Millisecond => seconds(array.as_ref(), Millisecond)?, IntervalUnit::Microsecond => seconds(array.as_ref(), Microsecond)?, @@ -200,9 +210,9 @@ impl ScalarUDFImpl for DatePartFunc { } else { // special cases that can be extracted (in postgres) but are not interval units match part_trim.to_lowercase().as_str() { - "qtr" | "quarter" => date_part_f64(array.as_ref(), DatePart::Quarter)?, - "doy" => date_part_f64(array.as_ref(), DatePart::DayOfYear)?, - "dow" => date_part_f64(array.as_ref(), DatePart::DayOfWeekSunday0)?, + "qtr" | "quarter" => date_part(array.as_ref(), DatePart::Quarter)?, + "doy" => date_part(array.as_ref(), DatePart::DayOfYear)?, + "dow" => date_part(array.as_ref(), DatePart::DayOfWeekSunday0)?, "epoch" => epoch(array.as_ref())?, _ => return exec_err!("Date part '{part}' not supported"), } @@ -223,6 +233,30 @@ impl ScalarUDFImpl for DatePartFunc { } } +fn is_integar_part(part: &str) -> bool { + let part = part_normalization(part); + matches!( + part.to_lowercase().as_str(), + "year" + | "month" + | "week" + | "day" + | "hour" + | "minute" + | "qtr" + | "quarter" + | "doy" + | "dow" + ) +} + +// Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error +fn part_normalization(part: &str) -> &str { + part.strip_prefix(|c| c == '\'' || c == '\"') + .and_then(|s| s.strip_suffix(|c| c == '\'' || c == '\"')) + .unwrap_or(part) +} + static DOCUMENTATION: OnceLock = OnceLock::new(); fn get_date_part_doc() -> &'static Documentation { @@ -261,11 +295,6 @@ fn get_date_part_doc() -> &'static Documentation { }) } -/// Invoke [`date_part`] and cast the result to Float64 -fn date_part_f64(array: &dyn Array, part: DatePart) -> Result { - Ok(cast(date_part(array, part)?.as_ref(), &Float64)?) -} - /// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the /// result to a total number of seconds, milliseconds, microseconds or /// nanoseconds diff --git a/datafusion/sqllogictest/test_files/clickbench.slt b/datafusion/sqllogictest/test_files/clickbench.slt index 733c0a3cd972..dfcd92475857 100644 --- a/datafusion/sqllogictest/test_files/clickbench.slt +++ b/datafusion/sqllogictest/test_files/clickbench.slt @@ -136,7 +136,7 @@ SELECT "UserID", "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", "SearchPh 519640690937130534 (empty) 2 7418527520126366595 (empty) 1 -query IRTI rowsort +query IITI rowsort SELECT "UserID", extract(minute FROM to_timestamp_seconds("EventTime")) AS m, "SearchPhrase", COUNT(*) FROM hits GROUP BY "UserID", m, "SearchPhrase" ORDER BY COUNT(*) DESC LIMIT 10; ---- -2461439046089301801 18 (empty) 1 diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 31467072dd3e..8ccc41ed6908 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -840,182 +840,182 @@ SELECT EXTRACT("'''year'''" FROM timestamp '2020-09-08T12:00:00+00:00') query error SELECT EXTRACT("'year'" FROM timestamp '2020-09-08T12:00:00+00:00') -query R +query I SELECT date_part('YEAR', CAST('2000-01-01' AS DATE)) ---- 2000 -query R +query I SELECT EXTRACT(year FROM timestamp '2020-09-08T12:00:00+00:00') ---- 2020 -query R +query I SELECT EXTRACT("year" FROM timestamp '2020-09-08T12:00:00+00:00') ---- 2020 -query R +query I SELECT EXTRACT('year' FROM timestamp '2020-09-08T12:00:00+00:00') ---- 2020 -query R +query I SELECT date_part('QUARTER', CAST('2000-01-01' AS DATE)) ---- 1 -query R +query I SELECT EXTRACT(quarter FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 3 -query R +query I SELECT EXTRACT("quarter" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 3 -query R +query I SELECT EXTRACT('quarter' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 3 -query R +query I SELECT date_part('MONTH', CAST('2000-01-01' AS DATE)) ---- 1 -query R +query I SELECT EXTRACT(month FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 9 -query R +query I SELECT EXTRACT("month" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 9 -query R +query I SELECT EXTRACT('month' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 9 -query R +query I SELECT date_part('WEEK', CAST('2003-01-01' AS DATE)) ---- 1 -query R +query I SELECT EXTRACT(WEEK FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 37 -query R +query I SELECT EXTRACT("WEEK" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 37 -query R +query I SELECT EXTRACT('WEEK' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 37 -query R +query I SELECT date_part('DAY', CAST('2000-01-01' AS DATE)) ---- 1 -query R +query I SELECT EXTRACT(day FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 8 -query R +query I SELECT EXTRACT("day" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 8 -query R +query I SELECT EXTRACT('day' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 8 -query R +query I SELECT date_part('DOY', CAST('2000-01-01' AS DATE)) ---- 1 -query R +query I SELECT EXTRACT(doy FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 252 -query R +query I SELECT EXTRACT("doy" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 252 -query R +query I SELECT EXTRACT('doy' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 252 -query R +query I SELECT date_part('DOW', CAST('2000-01-01' AS DATE)) ---- 6 -query R +query I SELECT EXTRACT(dow FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 2 -query R +query I SELECT EXTRACT("dow" FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 2 -query R +query I SELECT EXTRACT('dow' FROM to_timestamp('2020-09-08T12:00:00+00:00')) ---- 2 -query R +query I SELECT date_part('HOUR', CAST('2000-01-01' AS DATE)) ---- 0 -query R +query I SELECT EXTRACT(hour FROM to_timestamp('2020-09-08T12:03:03+00:00')) ---- 12 -query R +query I SELECT EXTRACT("hour" FROM to_timestamp('2020-09-08T12:03:03+00:00')) ---- 12 -query R +query I SELECT EXTRACT('hour' FROM to_timestamp('2020-09-08T12:03:03+00:00')) ---- 12 -query R +query I SELECT EXTRACT(minute FROM to_timestamp('2020-09-08T12:12:00+00:00')) ---- 12 -query R +query I SELECT EXTRACT("minute" FROM to_timestamp('2020-09-08T12:12:00+00:00')) ---- 12 -query R +query I SELECT EXTRACT('minute' FROM to_timestamp('2020-09-08T12:12:00+00:00')) ---- 12 -query R +query I SELECT date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00')) ---- 12 @@ -1124,22 +1124,22 @@ SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00') # test_date_part_time ## time32 seconds -query R +query I SELECT date_part('hour', arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 23 -query R +query I SELECT extract(hour from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 23 -query R +query I SELECT date_part('minute', arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 32 -query R +query I SELECT extract(minute from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 32 @@ -1195,22 +1195,22 @@ SELECT extract(epoch from arrow_cast('23:32:50'::time, 'Time32(Second)')) 84770 ## time32 milliseconds -query R +query I SELECT date_part('hour', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 23 -query R +query I SELECT extract(hour from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 23 -query R +query I SELECT date_part('minute', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 32 -query R +query I SELECT extract(minute from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 32 @@ -1266,22 +1266,22 @@ SELECT extract(epoch from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)' 84770.123 ## time64 microseconds -query R +query I SELECT date_part('hour', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 23 -query R +query I SELECT extract(hour from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 23 -query R +query I SELECT date_part('minute', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 32 -query R +query I SELECT extract(minute from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 32 @@ -1337,22 +1337,22 @@ SELECT extract(epoch from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecon 84770.123456 ## time64 nanoseconds -query R +query I SELECT date_part('hour', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- 23 -query R +query I SELECT extract(hour from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- 23 -query R +query I SELECT date_part('minute', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- 32 -query R +query I SELECT extract(minute from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- 32 @@ -1487,32 +1487,32 @@ SELECT extract(epoch from arrow_cast('1969-12-31', 'Date64')) # test_extract_interval -query R +query I SELECT extract(year from arrow_cast('10 years', 'Interval(YearMonth)')) ---- 10 -query R +query I SELECT extract(month from arrow_cast('10 years', 'Interval(YearMonth)')) ---- 0 -query R +query I SELECT extract(year from arrow_cast('10 months', 'Interval(YearMonth)')) ---- 0 -query R +query I SELECT extract(month from arrow_cast('10 months', 'Interval(YearMonth)')) ---- 10 -query R +query I SELECT extract(year from arrow_cast('20 months', 'Interval(YearMonth)')) ---- 1 -query R +query I SELECT extract(month from arrow_cast('20 months', 'Interval(YearMonth)')) ---- 8 @@ -1523,17 +1523,17 @@ SELECT extract(year from arrow_cast('10 days', 'Interval(DayTime)')) query error DataFusion error: Arrow error: Compute error: Month does not support: Interval\(DayTime\) SELECT extract(month from arrow_cast('10 days', 'Interval(DayTime)')) -query R +query I SELECT extract(day from arrow_cast('10 days', 'Interval(DayTime)')) ---- 10 -query R +query I SELECT extract(day from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- 0 -query R +query I SELECT extract(minute from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- 14400 @@ -1597,7 +1597,7 @@ create table t (id int, i interval) as values (4, interval '8 months'), (5, NULL); -query IRR rowsort +query IRI select id, extract(second from i), @@ -1652,7 +1652,7 @@ SELECT extract(millisecond from arrow_cast(2002, 'Duration(Millisecond)')) ---- 2002 -query R +query I SELECT extract(day from arrow_cast(864000, 'Duration(Second)')) ---- 10 @@ -1663,7 +1663,7 @@ SELECT extract(month from arrow_cast(864000, 'Duration(Second)')) query error DataFusion error: Arrow error: Compute error: Year does not support: Duration\(Second\) SELECT extract(year from arrow_cast(864000, 'Duration(Second)')) -query R +query I SELECT extract(day from arrow_cast(NULL, 'Duration(Second)')) ---- NULL @@ -2230,7 +2230,7 @@ SELECT digest('','blake3'); ---- af1349b9f5f9a1a6a0404dea36dcc9499bcb25c9adc112b7cc9a93cae41f3262 -# vverify utf8view +# vverify utf8view query ? SELECT sha224(arrow_cast('tom', 'Utf8View')); ---- diff --git a/datafusion/sqllogictest/test_files/group_by.slt b/datafusion/sqllogictest/test_files/group_by.slt index 391f84836871..bc974a57b2db 100644 --- a/datafusion/sqllogictest/test_files/group_by.slt +++ b/datafusion/sqllogictest/test_files/group_by.slt @@ -4294,7 +4294,7 @@ physical_plan 08)--------------RepartitionExec: partitioning=RoundRobinBatch(8), input_partitions=1 09)----------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/core/tests/data/timestamps.csv]]}, projection=[ts], output_ordering=[ts@0 DESC], has_header=false -query R +query I SELECT extract(month from ts) as months FROM csv_with_timestamps GROUP BY extract(month from ts) @@ -4344,7 +4344,7 @@ create table t1(state string, city string, min_temp float, area int, time timest ('MA', 'Boston', 70.4, 1, 50), ('MA', 'Bedford', 71.59, 2, 150); -query RI +query II select date_part('year', time) as bla, count(distinct state) as count from t1 group by bla; ---- 1970 1 diff --git a/datafusion/sqllogictest/test_files/timestamps.slt b/datafusion/sqllogictest/test_files/timestamps.slt index a80036df2ca8..b713008d2c3b 100644 --- a/datafusion/sqllogictest/test_files/timestamps.slt +++ b/datafusion/sqllogictest/test_files/timestamps.slt @@ -1756,13 +1756,13 @@ SELECT date_bin('1 day', TIMESTAMPTZ '2022-01-01 20:10:00Z', TIMESTAMP '2020-01- 2022-01-01T07:00:00+07:00 # postgresql: 1 -query R +query I SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as part ---- 1 # postgresql: 8 -query R +query I SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01Z') as part ---- 8 @@ -1839,13 +1839,13 @@ SELECT date_bin('2 hour', TIMESTAMPTZ '2022-01-01 01:10:00+07', '2020-01-01T00:0 2021-12-31T18:00:00Z # postgresql: 1 -query R +query I SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01') as part ---- 1 # postgresql: 18 -query R +query I SELECT date_part('hour', TIMESTAMPTZ '2000-01-01T01:01:01+07') as part ---- 18 From 96342af0acf39e11685684b570efafe43275be38 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Mon, 18 Nov 2024 18:04:57 +0800 Subject: [PATCH 2/7] fix tpch test Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/tpch/q7.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/q8.slt.part | 2 +- datafusion/sqllogictest/test_files/tpch/q9.slt.part | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/sqllogictest/test_files/tpch/q7.slt.part b/datafusion/sqllogictest/test_files/tpch/q7.slt.part index a16af4710478..92ce48c286be 100644 --- a/datafusion/sqllogictest/test_files/tpch/q7.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q7.slt.part @@ -141,7 +141,7 @@ physical_plan -query TTRR +query TTIR select supp_nation, cust_nation, diff --git a/datafusion/sqllogictest/test_files/tpch/q8.slt.part b/datafusion/sqllogictest/test_files/tpch/q8.slt.part index fd5773438466..225836a4b4d4 100644 --- a/datafusion/sqllogictest/test_files/tpch/q8.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q8.slt.part @@ -163,7 +163,7 @@ physical_plan -query RR +query IR select o_year, cast(cast(sum(case diff --git a/datafusion/sqllogictest/test_files/tpch/q9.slt.part b/datafusion/sqllogictest/test_files/tpch/q9.slt.part index c4910beb842b..8cde946db877 100644 --- a/datafusion/sqllogictest/test_files/tpch/q9.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q9.slt.part @@ -127,7 +127,7 @@ physical_plan -query TRR +query TIR select nation, o_year, From 6be9d9f5cfd92233ec2d32327fed759049630d0f Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Tue, 19 Nov 2024 11:38:36 +0800 Subject: [PATCH 3/7] type test Signed-off-by: jayzhan211 --- datafusion/sqllogictest/test_files/expr.slt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 8ccc41ed6908..1add874ecf3f 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1020,6 +1020,12 @@ SELECT date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00')) ---- 12 +# make sure the return type is integer +query T +SELECT arrow_typeof(date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00'))) +---- +Int32 + query R SELECT EXTRACT(second FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- From 0c17dbd30d415a3c1e23237f87c7f973e337e940 Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Tue, 19 Nov 2024 20:51:23 +0800 Subject: [PATCH 4/7] Update datafusion/functions/src/datetime/date_part.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Daniƫl Heres --- datafusion/functions/src/datetime/date_part.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 9e1e19ac5502..48b1eb42f38c 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -233,7 +233,7 @@ impl ScalarUDFImpl for DatePartFunc { } } -fn is_integar_part(part: &str) -> bool { +fn is_integer_part(part: &str) -> bool { let part = part_normalization(part); matches!( part.to_lowercase().as_str(), From 8a334d39f333d4cb8320f8b1c28fafd326041af1 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 20 Nov 2024 10:47:45 +0800 Subject: [PATCH 5/7] fix name Signed-off-by: jayzhan211 --- datafusion/functions/src/datetime/date_part.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 48b1eb42f38c..288241607322 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -158,7 +158,7 @@ impl ScalarUDFImpl for DatePartFunc { _arg_types: &[DataType], ) -> Result { match &args[0] { - Expr::Literal(ScalarValue::Utf8(Some(part))) if is_integar_part(part) => { + Expr::Literal(ScalarValue::Utf8(Some(part))) if is_integer_part(part) => { Ok(DataType::Int32) } _ => Ok(DataType::Float64), From 7985684068964ba04f5c3a858a13bf7d518a93d4 Mon Sep 17 00:00:00 2001 From: jayzhan211 Date: Wed, 20 Nov 2024 16:02:22 +0800 Subject: [PATCH 6/7] use int for second Signed-off-by: jayzhan211 --- .../functions/src/datetime/date_part.rs | 86 ++++-- datafusion/sqllogictest/test_files/expr.slt | 244 ++++++++---------- 2 files changed, 167 insertions(+), 163 deletions(-) diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index 288241607322..98c6b659a824 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -19,7 +19,7 @@ use std::any::Any; use std::str::FromStr; use std::sync::{Arc, OnceLock}; -use arrow::array::{Array, ArrayRef, Float64Array}; +use arrow::array::{Array, ArrayRef, Float64Array, Int32Array}; use arrow::compute::kernels::cast_utils::IntervalUnit; use arrow::compute::{binary, date_part, DatePart}; use arrow::datatypes::DataType::{ @@ -158,10 +158,10 @@ impl ScalarUDFImpl for DatePartFunc { _arg_types: &[DataType], ) -> Result { match &args[0] { - Expr::Literal(ScalarValue::Utf8(Some(part))) if is_integer_part(part) => { - Ok(DataType::Int32) + Expr::Literal(ScalarValue::Utf8(Some(part))) if is_epoch(part) => { + Ok(DataType::Float64) } - _ => Ok(DataType::Float64), + _ => Ok(DataType::Int32), } } @@ -200,10 +200,10 @@ impl ScalarUDFImpl for DatePartFunc { IntervalUnit::Day => date_part(array.as_ref(), DatePart::Day)?, IntervalUnit::Hour => date_part(array.as_ref(), DatePart::Hour)?, IntervalUnit::Minute => date_part(array.as_ref(), DatePart::Minute)?, - IntervalUnit::Second => seconds(array.as_ref(), Second)?, - IntervalUnit::Millisecond => seconds(array.as_ref(), Millisecond)?, - IntervalUnit::Microsecond => seconds(array.as_ref(), Microsecond)?, - IntervalUnit::Nanosecond => seconds(array.as_ref(), Nanosecond)?, + IntervalUnit::Second => seconds_as_i32(array.as_ref(), Second)?, + IntervalUnit::Millisecond => seconds_as_i32(array.as_ref(), Millisecond)?, + IntervalUnit::Microsecond => seconds_as_i32(array.as_ref(), Microsecond)?, + IntervalUnit::Nanosecond => seconds_as_i32(array.as_ref(), Nanosecond)?, // century and decade are not supported by `DatePart`, although they are supported in postgres _ => return exec_err!("Date part '{part}' not supported"), } @@ -233,21 +233,9 @@ impl ScalarUDFImpl for DatePartFunc { } } -fn is_integer_part(part: &str) -> bool { +fn is_epoch(part: &str) -> bool { let part = part_normalization(part); - matches!( - part.to_lowercase().as_str(), - "year" - | "month" - | "week" - | "day" - | "hour" - | "minute" - | "qtr" - | "quarter" - | "doy" - | "dow" - ) + matches!(part.to_lowercase().as_str(), "epoch") } // Try to remove quote if exist, if the quote is invalid, return original string and let the downstream function handle the error @@ -298,6 +286,60 @@ fn get_date_part_doc() -> &'static Documentation { /// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the /// result to a total number of seconds, milliseconds, microseconds or /// nanoseconds +fn seconds_as_i32(array: &dyn Array, unit: TimeUnit) -> Result { + // Nanosecond is neither supported in Postgres nor DuckDB, to avoid to deal with overflow and precision issue we don't support nanosecond + if unit == Nanosecond { + return internal_err!("unit {unit:?} not supported"); + } + + let conversion_factor = match unit { + Second => 1_000_000_000, + Millisecond => 1_000_000, + Microsecond => 1_000, + Nanosecond => 1, + }; + + let second_factor = match unit { + Second => 1, + Millisecond => 1_000, + Microsecond => 1_000_000, + Nanosecond => 1_000_000_000, + }; + + let secs = date_part(array, DatePart::Second)?; + // This assumes array is primitive and not a dictionary + let secs = as_int32_array(secs.as_ref())?; + let subsecs = date_part(array, DatePart::Nanosecond)?; + let subsecs = as_int32_array(subsecs.as_ref())?; + + // Special case where there are no nulls. + if subsecs.null_count() == 0 { + let r: Int32Array = binary(secs, subsecs, |secs, subsecs| { + secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor + })?; + Ok(Arc::new(r)) + } else { + // Nulls in secs are preserved, nulls in subsecs are treated as zero to account for the case + // where the number of nanoseconds overflows. + let r: Int32Array = secs + .iter() + .zip(subsecs) + .map(|(secs, subsecs)| { + secs.map(|secs| { + let subsecs = subsecs.unwrap_or(0); + secs * second_factor + (subsecs % 1_000_000_000) / conversion_factor + }) + }) + .collect(); + Ok(Arc::new(r)) + } +} + +/// Invoke [`date_part`] on an `array` (e.g. Timestamp) and convert the +/// result to a total number of seconds, milliseconds, microseconds or +/// nanoseconds +/// +/// Given epoch return f64, this is a duplicated function to optimize for f64 type fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { let sf = match unit { Second => 1_f64, diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 1add874ecf3f..27dd92f35424 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1026,106 +1026,98 @@ SELECT arrow_typeof(date_part('minute', to_timestamp('2020-09-08T12:12:00+00:00' ---- Int32 -query R +query I SELECT EXTRACT(second FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT EXTRACT(millisecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12123 -query R +query I SELECT EXTRACT(microsecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT EXTRACT(nanosecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 -query R +query I SELECT EXTRACT("second" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT EXTRACT("millisecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12123 -query R +query I SELECT EXTRACT("microsecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT EXTRACT("nanosecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 -query R +query I SELECT EXTRACT('second' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT EXTRACT('millisecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12123 -query R +query I SELECT EXTRACT('microsecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT EXTRACT('nanosecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 + # Keep precision when coercing Utf8 to Timestamp -query R +query I SELECT date_part('second', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT date_part('millisecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12123 -query R +query I SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 -query R + +query I SELECT date_part('second', '2020-09-08T12:00:12.12345678+00:00') ---- -12.12345678 +12 -query R +query I SELECT date_part('millisecond', '2020-09-08T12:00:12.12345678+00:00') ---- -12123.45678 +12123 -query R +query I SELECT date_part('microsecond', '2020-09-08T12:00:12.12345678+00:00') ---- -12123456.78 +12123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00') ----- -12123456780 # test_date_part_time @@ -1150,45 +1142,38 @@ SELECT extract(minute from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 32 -query R +query I SELECT date_part('second', arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50 -query R +query I SELECT extract(second from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50 -query R +query I SELECT date_part('millisecond', arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50000 -query R +query I SELECT extract(millisecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50000 -query R +query I SELECT date_part('microsecond', arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50000000 -query R +query I SELECT extract(microsecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50000000 -query R -SELECT date_part('nanosecond', arrow_cast('23:32:50'::time, 'Time32(Second)')) ----- -50000000000 - -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT extract(nanosecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) ----- -50000000000 query R SELECT date_part('epoch', arrow_cast('23:32:50'::time, 'Time32(Second)')) @@ -1221,45 +1206,38 @@ SELECT extract(minute from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond) ---- 32 -query R +query I SELECT date_part('second', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- -50.123 +50 -query R +query I SELECT extract(second from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- -50.123 +50 -query R +query I SELECT date_part('millisecond', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 50123 -query R +query I SELECT extract(millisecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 50123 -query R +query I SELECT date_part('microsecond', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 50123000 -query R +query I SELECT extract(microsecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ---- 50123000 -query R -SELECT date_part('nanosecond', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ----- -50123000000 - -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT extract(nanosecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) ----- -50123000000 query R SELECT date_part('epoch', arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) @@ -1292,45 +1270,38 @@ SELECT extract(minute from arrow_cast('23:32:50.123456'::time, 'Time64(Microseco ---- 32 -query R +query I SELECT date_part('second', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- -50.123456 +50 -query R +query I SELECT extract(second from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- -50.123456 +50 -query R +query I SELECT date_part('millisecond', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- -50123.456 +50123 -query R +query I SELECT extract(millisecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- -50123.456 +50123 -query R +query I SELECT date_part('microsecond', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 50123456 -query R +query I SELECT extract(microsecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ---- 50123456 -query R -SELECT date_part('nanosecond', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ----- -50123456000 - -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT extract(nanosecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) ----- -50123456000 query R SELECT date_part('epoch', arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) @@ -1363,61 +1334,54 @@ SELECT extract(minute from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanose ---- 32 -query R +query I SELECT date_part('second', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50.123456789 +50 -query R +query I select extract(second from '2024-08-09T12:13:14') ---- 14 -query R +query I select extract(seconds from '2024-08-09T12:13:14') ---- 14 -query R +query I SELECT extract(second from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50.123456789 +50 -query R +query I SELECT date_part('millisecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50123.456789 +50123 -query R +query I SELECT extract(millisecond from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50123.456789 +50123 # just some floating point stuff happening in the result here -query R +query I SELECT date_part('microsecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50123456.789000005 +50123456 -query R +query I SELECT extract(microsecond from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50123456.789000005 +50123456 -query R +query I SELECT extract(us from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ---- -50123456.789000005 +50123456 -query R +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT date_part('nanosecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ----- -50123456789 - -query R -SELECT extract(nanosecond from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) ----- -50123456789 query R SELECT date_part('epoch', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) @@ -1544,32 +1508,32 @@ SELECT extract(minute from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- 14400 -query R +query I SELECT extract(second from arrow_cast('5.1 seconds', 'Interval(DayTime)')) ---- 5 -query R +query I SELECT extract(second from arrow_cast('14400 minutes', 'Interval(DayTime)')) ---- 864000 -query R +query I SELECT extract(second from arrow_cast('2 months', 'Interval(MonthDayNano)')) ---- 0 -query R +query I SELECT extract(second from arrow_cast('2 days', 'Interval(MonthDayNano)')) ---- 0 -query R +query I SELECT extract(second from arrow_cast('2 seconds', 'Interval(MonthDayNano)')) ---- 2 -query R +query I SELECT extract(seconds from arrow_cast('2 seconds', 'Interval(MonthDayNano)')) ---- 2 @@ -1579,17 +1543,17 @@ SELECT extract(epoch from arrow_cast('2 seconds', 'Interval(MonthDayNano)')) ---- 2 -query R +query I SELECT extract(milliseconds from arrow_cast('2 seconds', 'Interval(MonthDayNano)')) ---- 2000 -query R +query I SELECT extract(second from arrow_cast('2030 milliseconds', 'Interval(MonthDayNano)')) ---- -2.03 +2 -query R +query I SELECT extract(second from arrow_cast(NULL, 'Interval(MonthDayNano)')) ---- NULL @@ -1603,7 +1567,7 @@ create table t (id int, i interval) as values (4, interval '8 months'), (5, NULL); -query IRI +query III select id, extract(second from i), @@ -1611,9 +1575,9 @@ select from t order by id; ---- -0 0.00000001 5 +0 0 5 1 0 15 -2 0.002 0 +2 0 0 3 2 0 4 0 8 5 NULL NULL @@ -1623,12 +1587,12 @@ drop table t; # test_extract_duration -query R +query I SELECT extract(second from arrow_cast(2, 'Duration(Second)')) ---- 2 -query R +query I SELECT extract(seconds from arrow_cast(2, 'Duration(Second)')) ---- 2 @@ -1638,22 +1602,22 @@ SELECT extract(epoch from arrow_cast(2, 'Duration(Second)')) ---- 2 -query R +query I SELECT extract(millisecond from arrow_cast(2, 'Duration(Second)')) ---- 2000 -query R +query I SELECT extract(second from arrow_cast(2, 'Duration(Millisecond)')) ---- -0.002 +0 -query R +query I SELECT extract(second from arrow_cast(2002, 'Duration(Millisecond)')) ---- -2.002 +2 -query R +query I SELECT extract(millisecond from arrow_cast(2002, 'Duration(Millisecond)')) ---- 2002 @@ -1726,10 +1690,8 @@ SELECT (date_part('microsecond', now()) = EXTRACT(microsecond FROM now())) ---- true -query B +query error DataFusion error: Internal error: unit Nanosecond not supported. SELECT (date_part('nanosecond', now()) = EXTRACT(nanosecond FROM now())) ----- -true query B SELECT 'a' IN ('a','b') From 68adb4ee7045d7df44dd07ce96c691adfd9d905a Mon Sep 17 00:00:00 2001 From: Jay Zhan Date: Thu, 21 Nov 2024 08:02:46 +0800 Subject: [PATCH 7/7] rm dot Signed-off-by: Jay Zhan --- datafusion/sqllogictest/test_files/expr.slt | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/datafusion/sqllogictest/test_files/expr.slt b/datafusion/sqllogictest/test_files/expr.slt index 27dd92f35424..499d279515c3 100644 --- a/datafusion/sqllogictest/test_files/expr.slt +++ b/datafusion/sqllogictest/test_files/expr.slt @@ -1041,7 +1041,7 @@ SELECT EXTRACT(microsecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT EXTRACT(nanosecond FROM timestamp '2020-09-08T12:00:12.12345678+00:00') query I @@ -1059,7 +1059,7 @@ SELECT EXTRACT("microsecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00' ---- 12123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT EXTRACT("nanosecond" FROM timestamp '2020-09-08T12:00:12.12345678+00:00') query I @@ -1077,7 +1077,7 @@ SELECT EXTRACT('microsecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00' ---- 12123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT EXTRACT('nanosecond' FROM timestamp '2020-09-08T12:00:12.12345678+00:00') @@ -1097,7 +1097,7 @@ SELECT date_part('microsecond', timestamp '2020-09-08T12:00:12.12345678+00:00') ---- 12123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', timestamp '2020-09-08T12:00:12.12345678+00:00') @@ -1116,7 +1116,7 @@ SELECT date_part('microsecond', '2020-09-08T12:00:12.12345678+00:00') ---- 12123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', '2020-09-08T12:00:12.12345678+00:00') # test_date_part_time @@ -1172,7 +1172,7 @@ SELECT extract(microsecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) ---- 50000000 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT extract(nanosecond from arrow_cast('23:32:50'::time, 'Time32(Second)')) query R @@ -1236,7 +1236,7 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123'::time, 'Time32(Millise ---- 50123000 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT extract(nanosecond from arrow_cast('23:32:50.123'::time, 'Time32(Millisecond)')) query R @@ -1300,7 +1300,7 @@ SELECT extract(microsecond from arrow_cast('23:32:50.123456'::time, 'Time64(Micr ---- 50123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT extract(nanosecond from arrow_cast('23:32:50.123456'::time, 'Time64(Microsecond)')) query R @@ -1380,7 +1380,7 @@ SELECT extract(us from arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond ---- 50123456 -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT date_part('nanosecond', arrow_cast('23:32:50.123456789'::time, 'Time64(Nanosecond)')) query R @@ -1690,7 +1690,7 @@ SELECT (date_part('microsecond', now()) = EXTRACT(microsecond FROM now())) ---- true -query error DataFusion error: Internal error: unit Nanosecond not supported. +query error DataFusion error: Internal error: unit Nanosecond not supported SELECT (date_part('nanosecond', now()) = EXTRACT(nanosecond FROM now())) query B