Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add native stringview support for LTRIM & RTRIM #11948

Merged
merged 5 commits into from
Aug 13, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 17 additions & 4 deletions datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Left, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Left, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,16 @@ impl LtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +87,18 @@ impl ScalarUDFImpl for LtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
ltrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
ltrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
other => exec_err!(
"Unsupported data type {other:?} for function ltrim, \
expected Utf8, LargeUtf8 or Utf8View."
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
),
}
}
}
21 changes: 17 additions & 4 deletions datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Right, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Right, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,16 @@ impl RtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +87,18 @@ impl ScalarUDFImpl for RtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
rtrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
rtrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
other => exec_err!(
"Unsupported data type {other:?} for function rtrim, \
expected Utf8, LargeUtf8 or Utf8View."
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
),
}
}
}
126 changes: 91 additions & 35 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,97 @@ Xiangpeng Xiangpeng Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for LTRIM
# Test LTRIM with Utf8View input
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with a Utf8View pattern longer than 12 bytes
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM outputs
query TTTT
SELECT
LTRIM(column1_utf8view, 'foo') AS l1,
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
LTRIM(column1_utf8view, column2_utf8view) AS l2,
LTRIM(column1_utf8view) AS l3,
LTRIM(column1_utf8view, NULL) AS l4
FROM test;
----
Andrew Andrew Andrew NULL
Xiangpeng (empty) Xiangpeng NULL
Raphael aphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for RTRIM
# Test RTRIM with Utf8View input
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with a Utf8View pattern longer than 12 bytes
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM outputs
query TTTT
SELECT
RTRIM(column1_utf8view, 'foo') AS l1,
Kev1n8 marked this conversation as resolved.
Show resolved Hide resolved
RTRIM(column1_utf8view, column2_utf8view) AS l2,
RTRIM(column1_utf8view) AS l3,
RTRIM(column1_utf8view, NULL) AS l4
FROM test;
----
Andrew Andrew Andrew NULL
Xiangpeng (empty) Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL


## Ensure no casts for CHARACTER_LENGTH
query TT
EXPLAIN SELECT
Expand Down Expand Up @@ -685,16 +776,6 @@ logical_plan
01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LTRIM
## TODO https://github.com/apache/datafusion/issues/11856
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LPAD
## TODO https://github.com/apache/datafusion/issues/11857
Expand Down Expand Up @@ -795,18 +876,6 @@ logical_plan
01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c1,
RTRIM(column1_utf8view, 'foo') as c2
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1
03)----TableScan: test projection=[column1_utf8view]

## Ensure no casts for RIGHT
## TODO file ticket
Expand All @@ -833,19 +902,6 @@ logical_plan
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]


## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c,
RTRIM(column1_utf8view, column2_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SPLIT_PART
## TODO file ticket
query TT
Expand Down