diff --git a/datafusion/functions/src/regex/regexplike.rs b/datafusion/functions/src/regex/regexplike.rs index 09b96a28c107..20029ba005c4 100644 --- a/datafusion/functions/src/regex/regexplike.rs +++ b/datafusion/functions/src/regex/regexplike.rs @@ -75,13 +75,10 @@ impl ScalarUDFImpl for RegexpLikeFunc { use DataType::*; Ok(match &arg_types[0] { - LargeUtf8 | Utf8 => Boolean, Null => Null, - other => { - return plan_err!( - "The regexp_like function can only accept strings. Got {other}" - ); - } + // Type coercion is done by DataFusion based on signature, so if we + // get here, the first argument is always a string + _ => Boolean, }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/regex/regexpmatch.rs b/datafusion/functions/src/regex/regexpmatch.rs index f57d3c17bd72..764acd7de757 100644 --- a/datafusion/functions/src/regex/regexpmatch.rs +++ b/datafusion/functions/src/regex/regexpmatch.rs @@ -74,17 +74,9 @@ impl ScalarUDFImpl for RegexpMatchFunc { } fn return_type(&self, arg_types: &[DataType]) -> Result { - use DataType::*; - Ok(match &arg_types[0] { - LargeUtf8 => List(Arc::new(Field::new("item", LargeUtf8, true))), - Utf8 => List(Arc::new(Field::new("item", Utf8, true))), - Null => Null, - other => { - return plan_err!( - "The regexp_match function can only accept strings. Got {other}" - ); - } + DataType::Null => DataType::Null, + other => DataType::List(Arc::new(Field::new("item", other.clone(), true))), }) } fn invoke(&self, args: &[ColumnarValue]) -> Result { diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index cee1a57bc6d9..e46ee162ff12 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -44,7 +44,7 @@ impl CharacterLengthFunc { Self { signature: Signature::uniform( 1, - vec![Utf8, LargeUtf8], + vec![Utf8, LargeUtf8, Utf8View], Volatility::Immutable, ), aliases: vec![String::from("length"), String::from("char_length")], diff --git a/datafusion/sqllogictest/test_files/string_view.slt b/datafusion/sqllogictest/test_files/string_view.slt index fc10a34256c5..e7166690580f 100644 --- a/datafusion/sqllogictest/test_files/string_view.slt +++ b/datafusion/sqllogictest/test_files/string_view.slt @@ -425,102 +425,26 @@ logical_plan 01)Projection: starts_with(test.column1_utf8view, Utf8View("äöüß")) AS c1, starts_with(test.column1_utf8view, Utf8View("")) AS c2, starts_with(test.column1_utf8view, Utf8View(NULL)) AS c3, starts_with(Utf8View(NULL), test.column1_utf8view) AS c4 02)--TableScan: test projection=[column1_utf8view] -statement ok -drop table test; - -# coercion from stringview to integer, as input to make_date -query D -select make_date(arrow_cast('2024', 'Utf8View'), arrow_cast('01', 'Utf8View'), arrow_cast('23', 'Utf8View')) ----- -2024-01-23 - -# coercions between stringview and date types -statement ok -create table dates (dt date) as values - (date '2024-01-23'), - (date '2023-11-30'); - -query D -select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; ----- -2024-01-23 - -statement ok -drop table dates; - -statement ok -create table temp as values -('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), -('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); - -query T -select column2||' is fast' from temp; ----- -rust is fast -datafusion is fast - - -query T -select column2 || ' is ' || column3 from temp; ----- -rust is fast -datafusion is cool +# Ensure string functions use native StringView implementation +# and do not fall back to Utf8 or LargeUtf8 +# Should see no casts to Utf8 in the plans below +## Ensure no casts for LIKE/ILIKE query TT -explain select column2 || 'is' || column3 from temp; ----- -logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || Utf8("is") || CAST(temp.column3 AS Utf8) -02)--TableScan: temp projection=[column2, column3] - - -query TT -explain select column2||' is fast' from temp; +EXPLAIN SELECT + column1_utf8view like 'foo' as "like", + column1_utf8view ilike 'foo' as "ilike" +FROM test; ---- logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || Utf8(" is fast") -02)--TableScan: temp projection=[column2] - +01)Projection: test.column1_utf8view LIKE Utf8View("foo") AS like, test.column1_utf8view ILIKE Utf8View("foo") AS ilike +02)--TableScan: test projection=[column1_utf8view] -query T -select column2||column3 from temp; ----- -rustfast -datafusioncool -query TT -explain select column2||column3 from temp; ----- -logical_plan -01)Projection: CAST(temp.column2 AS Utf8) || CAST(temp.column3 AS Utf8) -02)--TableScan: temp projection=[column2, column3] -query T -select column2|| ' ' ||column3 from temp; ----- -rust fast -datafusion cool +## Ensure no casts for ASCII ### ASCII -# Setup the initial test data -statement ok -create table test_source as values - ('Andrew', 'X'), - ('Xiangpeng', 'Xiangpeng'), - ('Raphael', 'R'), - (NULL, 'R'); - -# Table with the different combination of column types -statement ok -create table test as -SELECT - arrow_cast(column1, 'Utf8') as column1_utf8, - arrow_cast(column2, 'Utf8') as column2_utf8, - arrow_cast(column1, 'LargeUtf8') as column1_large_utf8, - arrow_cast(column2, 'LargeUtf8') as column2_large_utf8, - arrow_cast(column1, 'Utf8View') as column1_utf8view, - arrow_cast(column2, 'Utf8View') as column2_utf8view -FROM test_source; # Test ASCII with utf8view against utf8view, utf8, and largeutf8 # (should be no casts) @@ -594,8 +518,417 @@ SELECT ---- 228 0 NULL +## Ensure no casts for BTRIM +query TT +EXPLAIN SELECT + BTRIM(column1_utf8view, 'foo') AS l +FROM test; +---- +logical_plan +01)Projection: btrim(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS l +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CHARACTER_LENGTH +query TT +EXPLAIN SELECT + CHARACTER_LENGTH(column1_utf8view) AS l +FROM test; +---- +logical_plan +01)Projection: character_length(test.column1_utf8view) AS l +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for CONCAT +## TODO https://github.com/apache/datafusion/issues/11836 +query TT +EXPLAIN SELECT + concat(column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat(CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONCAT_WS +## TODO https://github.com/apache/datafusion/issues/11837 +query TT +EXPLAIN SELECT + concat_ws(', ', column1_utf8view, column2_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: concat_ws(Utf8(", "), CAST(test.column1_utf8view AS Utf8), CAST(test.column2_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for CONTAINS +## TODO https://github.com/apache/datafusion/issues/11838 +query TT +EXPLAIN SELECT + CONTAINS(column1_utf8view, 'foo') as c1, + CONTAINS(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: contains(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, contains(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for ENDS_WITH +## TODO https://github.com/apache/datafusion/issues/11852 +query TT +EXPLAIN SELECT + ENDS_WITH(column1_utf8view, 'foo') as c1, + ENDS_WITH(column2_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: ends_with(CAST(test.column1_utf8view AS Utf8), Utf8("foo")) AS c1, ends_with(__common_expr_1, __common_expr_1) AS c2 +02)--Projection: CAST(test.column2_utf8view AS Utf8) AS __common_expr_1, test.column1_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for INITCAP +## TODO https://github.com/apache/datafusion/issues/11853 +query TT +EXPLAIN SELECT + INITCAP(column1_utf8view) as c +FROM test; +---- +logical_plan +01)Projection: initcap(CAST(test.column1_utf8view AS Utf8)) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LEVENSHTEIN +## TODO https://github.com/apache/datafusion/issues/11854 +query TT +EXPLAIN SELECT + levenshtein(column1_utf8view, 'foo') as c1, + levenshtein(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: levenshtein(__common_expr_1, Utf8("foo")) AS c1, levenshtein(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for LOWER +## TODO https://github.com/apache/datafusion/issues/11855 +query TT +EXPLAIN SELECT + LOWER(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LTRIM +## TODO https://github.com/apache/datafusion/issues/11856 +query TT +EXPLAIN SELECT + LTRIM(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for LPAD +## TODO https://github.com/apache/datafusion/issues/11857 +query TT +EXPLAIN SELECT + LPAD(column1_utf8view, 12, ' ') as c1 +FROM test; +---- +logical_plan +01)Projection: lpad(CAST(test.column1_utf8view AS Utf8), Int64(12), Utf8(" ")) AS c1 +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for OCTET_LENGTH +## TODO https://github.com/apache/datafusion/issues/11858 +query TT +EXPLAIN SELECT + OCTET_LENGTH(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: octet_length(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for OVERLAY +## TODO file ticket +query TT +EXPLAIN SELECT + OVERLAY(column1_utf8view PLACING 'foo' FROM 2 ) as c1 +FROM test; +---- +logical_plan +01)Projection: overlay(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_LIKE +query TT +EXPLAIN SELECT + REGEXP_LIKE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_like(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_MATCH +query TT +EXPLAIN SELECT + REGEXP_MATCH(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_match(CAST(test.column1_utf8view AS Utf8), Utf8("^https?://(?:www\.)?([^/]+)/.*$")) AS k +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REGEXP_REPLACE +query TT +EXPLAIN SELECT + REGEXP_REPLACE(column1_utf8view, '^https?://(?:www\.)?([^/]+)/.*$', '\1') AS k +FROM test; +---- +logical_plan +01)Projection: regexp_replace(test.column1_utf8view, Utf8("^https?://(?:www\.)?([^/]+)/.*$"), Utf8("\1")) AS k +02)--TableScan: test projection=[column1_utf8view] + + +## Ensure no casts for REPEAT +## TODO file ticket +query TT +EXPLAIN SELECT + REPEAT(column1_utf8view, 2) as c1 +FROM test; +---- +logical_plan +01)Projection: repeat(CAST(test.column1_utf8view AS Utf8), Int64(2)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for REPLACE +## TODO file ticket +query TT +EXPLAIN SELECT + REPLACE(column1_utf8view, 'foo', 'bar') as c1, + REPLACE(column1_utf8view, column2_utf8view, 'bar') as c2 +FROM test; +---- +logical_plan +01)Projection: replace(__common_expr_1, Utf8("foo"), Utf8("bar")) AS c1, replace(__common_expr_1, CAST(test.column2_utf8view AS Utf8), Utf8("bar")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for REVERSE +## TODO file ticket +query TT +EXPLAIN SELECT + REVERSE(column1_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) as c1, + RTRIM(column1_utf8view, 'foo') as c2 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RIGHT +## TODO file ticket +query TT +EXPLAIN SELECT + RIGHT(column1_utf8view, 3) as c2 +FROM test; +---- +logical_plan +01)Projection: right(CAST(test.column1_utf8view AS Utf8), Int64(3)) AS c2 +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for RPAD +## TODO file ticket +query TT +EXPLAIN SELECT + RPAD(column1_utf8view, 1) as c1, + RPAD(column1_utf8view, 2, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: rpad(__common_expr_1, Int64(1)) AS c1, rpad(__common_expr_1, Int64(2), CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + + +## Ensure no casts for RTRIM +## TODO file ticket +query TT +EXPLAIN SELECT + RTRIM(column1_utf8view) as c, + RTRIM(column1_utf8view, column2_utf8view) as c1 +FROM test; +---- +logical_plan +01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SPLIT_PART +## TODO file ticket +query TT +EXPLAIN SELECT + SPLIT_PART(column1_utf8view, 'f', 1) as c +FROM test; +---- +logical_plan +01)Projection: split_part(CAST(test.column1_utf8view AS Utf8), Utf8("f"), Int64(1)) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for STRPOS +## TODO file ticket +query TT +EXPLAIN SELECT + STRPOS(column1_utf8view, 'f') as c, + STRPOS(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: strpos(__common_expr_1, Utf8("f")) AS c, strpos(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view +03)----TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for SUBSTR +## TODO file ticket +query TT +EXPLAIN SELECT + SUBSTR(column1_utf8view, 1) as c, + SUBSTR(column1_utf8view, 1 ,2) as c2 +FROM test; +---- +logical_plan +01)Projection: substr(__common_expr_1, Int64(1)) AS c, substr(__common_expr_1, Int64(1), Int64(2)) AS c2 +02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1 +03)----TableScan: test projection=[column1_utf8view] + +## Ensure no casts on columns for STARTS_WITH +query TT +EXPLAIN SELECT + STARTS_WITH(column1_utf8view, 'foo') as c, + STARTS_WITH(column1_utf8view, column2_utf8view) as c2 +FROM test; +---- +logical_plan +01)Projection: starts_with(test.column1_utf8view, Utf8View("foo")) AS c, starts_with(test.column1_utf8view, test.column2_utf8view) AS c2 +02)--TableScan: test projection=[column1_utf8view, column2_utf8view] + +## Ensure no casts for TRANSLATE +## TODO file ticket +query TT +EXPLAIN SELECT + TRANSLATE(column1_utf8view, 'foo', 'bar') as c +FROM test; +---- +logical_plan +01)Projection: translate(CAST(test.column1_utf8view AS Utf8), Utf8("foo"), Utf8("bar")) AS c +02)--TableScan: test projection=[column1_utf8view] + +## Ensure no casts for FIND_IN_SET +## TODO file ticket +query TT +EXPLAIN SELECT + FIND_IN_SET(column1_utf8view, 'a,b,c,d') as c +FROM test; +---- +logical_plan +01)Projection: find_in_set(CAST(test.column1_utf8view AS Utf8), Utf8("a,b,c,d")) AS c +02)--TableScan: test projection=[column1_utf8view] + + + + statement ok drop table test; +# coercion from stringview to integer, as input to make_date +query D +select make_date(arrow_cast('2024', 'Utf8View'), arrow_cast('01', 'Utf8View'), arrow_cast('23', 'Utf8View')) +---- +2024-01-23 + +# coercions between stringview and date types +statement ok +create table dates (dt date) as values + (date '2024-01-23'), + (date '2023-11-30'); + +query D +select t.dt from dates t where arrow_cast('2024-01-01', 'Utf8View') < t.dt; +---- +2024-01-23 + +statement ok +drop table dates; + statement ok -drop table test_source; +create table temp as values +('value1', arrow_cast('rust', 'Utf8View'), arrow_cast('fast', 'Utf8View')), +('value2', arrow_cast('datafusion', 'Utf8View'), arrow_cast('cool', 'Utf8View')); + +query T +select column2||' is fast' from temp; +---- +rust is fast +datafusion is fast + + +query T +select column2 || ' is ' || column3 from temp; +---- +rust is fast +datafusion is cool + +query TT +explain select column2 || 'is' || column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8("is") || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + + +query TT +explain select column2||' is fast' from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || Utf8(" is fast") +02)--TableScan: temp projection=[column2] + + +query T +select column2||column3 from temp; +---- +rustfast +datafusioncool + +query TT +explain select column2||column3 from temp; +---- +logical_plan +01)Projection: CAST(temp.column2 AS Utf8) || CAST(temp.column3 AS Utf8) +02)--TableScan: temp projection=[column2, column3] + +query T +select column2|| ' ' ||column3 from temp; +---- +rust fast +datafusion cool