Skip to content

Commit

Permalink
Add native stringview support for LTRIM & RTRIM (#11948)
Browse files Browse the repository at this point in the history
* add stringview option for ltrim

* add stringview option for rtrim

* add some tests to ensure no casts for ltrim & rtrim when using stringview

* fix typo and remove useless comments

* add tests covering ltrim and rtrim functioning
  • Loading branch information
Kev1n8 authored Aug 13, 2024
1 parent 508da80 commit e8ac93a
Show file tree
Hide file tree
Showing 4 changed files with 126 additions and 45 deletions.
3 changes: 1 addition & 2 deletions datafusion/functions/src/string/btrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,6 @@ impl BTrimFunc {
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
// Exact(vec![Utf8, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
Expand Down Expand Up @@ -98,7 +97,7 @@ impl ScalarUDFImpl for BTrimFunc {
)(args),
other => exec_err!(
"Unsupported data type {other:?} for function btrim,\
expected for Utf8, LargeUtf8 or Utf8View."
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
Expand Down
20 changes: 16 additions & 4 deletions datafusion/functions/src/string/ltrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with leading characters removed. If the characters are not specified, whitespace is removed.
/// ltrim('zzzytest', 'xyz') = 'test'
fn ltrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Left, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Left, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,15 @@ impl LtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +86,18 @@ impl ScalarUDFImpl for LtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
ltrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
ltrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function ltrim"),
other => exec_err!(
"Unsupported data type {other:?} for function ltrim,\
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
}
20 changes: 16 additions & 4 deletions datafusion/functions/src/string/rtrim.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,8 @@ use crate::utils::{make_scalar_function, utf8_to_str_type};
/// Returns the longest string with trailing characters removed. If the characters are not specified, whitespace is removed.
/// rtrim('testxxzx', 'xyz') = 'test'
fn rtrim<T: OffsetSizeTrait>(args: &[ArrayRef]) -> Result<ArrayRef> {
general_trim::<T>(args, TrimType::Right, false)
let use_string_view = args[0].data_type() == &DataType::Utf8View;
general_trim::<T>(args, TrimType::Right, use_string_view)
}

#[derive(Debug)]
Expand All @@ -51,7 +52,15 @@ impl RtrimFunc {
use DataType::*;
Self {
signature: Signature::one_of(
vec![Exact(vec![Utf8]), Exact(vec![Utf8, Utf8])],
vec![
// Planner attempts coercion to the target type starting with the most preferred candidate.
// For example, given input `(Utf8View, Utf8)`, it first tries coercing to `(Utf8View, Utf8View)`.
// If that fails, it proceeds to `(Utf8, Utf8)`.
Exact(vec![Utf8View, Utf8View]),
Exact(vec![Utf8, Utf8]),
Exact(vec![Utf8View]),
Exact(vec![Utf8]),
],
Volatility::Immutable,
),
}
Expand All @@ -77,15 +86,18 @@ impl ScalarUDFImpl for RtrimFunc {

fn invoke(&self, args: &[ColumnarValue]) -> Result<ColumnarValue> {
match args[0].data_type() {
DataType::Utf8 => make_scalar_function(
DataType::Utf8 | DataType::Utf8View => make_scalar_function(
rtrim::<i32>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
DataType::LargeUtf8 => make_scalar_function(
rtrim::<i64>,
vec![Hint::Pad, Hint::AcceptsSingular],
)(args),
other => exec_err!("Unsupported data type {other:?} for function rtrim"),
other => exec_err!(
"Unsupported data type {other:?} for function rtrim,\
expected Utf8, LargeUtf8 or Utf8View."
),
}
}
}
128 changes: 93 additions & 35 deletions datafusion/sqllogictest/test_files/string_view.slt
Original file line number Diff line number Diff line change
Expand Up @@ -607,6 +607,99 @@ Xiangpeng Xiangpeng Xiangpeng NULL
Raphael Raphael Raphael NULL
NULL NULL NULL NULL

## Ensure no casts for LTRIM
# Test LTRIM with Utf8View input
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM with a Utf8View pattern longer than 12 bytes
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: ltrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test LTRIM outputs
query TTTTT
SELECT
LTRIM(column1_utf8view, 'foo') AS l1,
LTRIM(column1_utf8view, column2_utf8view) AS l2,
LTRIM(column1_utf8view) AS l3,
LTRIM(column1_utf8view, NULL) AS l4,
LTRIM(column1_utf8view, 'Xiang') AS l5
FROM test;
----
Andrew Andrew Andrew NULL Andrew
Xiangpeng (empty) Xiangpeng NULL peng
Raphael aphael Raphael NULL Raphael
NULL NULL NULL NULL NULL

## Ensure no casts for RTRIM
# Test RTRIM with Utf8View input
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with Utf8View input and Utf8View pattern
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'foo') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("foo")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM with a Utf8View pattern longer than 12 bytes
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view, 'this is longer than 12') AS l
FROM test;
----
logical_plan
01)Projection: rtrim(test.column1_utf8view, Utf8View("this is longer than 12")) AS l
02)--TableScan: test projection=[column1_utf8view]

# Test RTRIM outputs
query TTTTT
SELECT
RTRIM(column1_utf8view, 'foo') AS l1,
RTRIM(column1_utf8view, column2_utf8view) AS l2,
RTRIM(column1_utf8view) AS l3,
RTRIM(column1_utf8view, NULL) AS l4,
RTRIM(column1_utf8view, 'peng') As l5
FROM test;
----
Andrew Andrew Andrew NULL Andrew
Xiangpeng (empty) Xiangpeng NULL Xia
Raphael Raphael Raphael NULL Raphael
NULL NULL NULL NULL NULL


## Ensure no casts for CHARACTER_LENGTH
query TT
EXPLAIN SELECT
Expand Down Expand Up @@ -685,16 +778,6 @@ logical_plan
01)Projection: lower(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LTRIM
## TODO https://github.com/apache/datafusion/issues/11856
query TT
EXPLAIN SELECT
LTRIM(column1_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: ltrim(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for LPAD
query TT
Expand Down Expand Up @@ -811,18 +894,6 @@ logical_plan
01)Projection: reverse(CAST(test.column1_utf8view AS Utf8)) AS c1
02)--TableScan: test projection=[column1_utf8view]

## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c1,
RTRIM(column1_utf8view, 'foo') as c2
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c1, rtrim(__common_expr_1, Utf8("foo")) AS c2
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1
03)----TableScan: test projection=[column1_utf8view]

## Ensure no casts for RIGHT
## TODO file ticket
Expand All @@ -849,19 +920,6 @@ logical_plan
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]


## Ensure no casts for RTRIM
## TODO file ticket
query TT
EXPLAIN SELECT
RTRIM(column1_utf8view) as c,
RTRIM(column1_utf8view, column2_utf8view) as c1
FROM test;
----
logical_plan
01)Projection: rtrim(__common_expr_1) AS c, rtrim(__common_expr_1, CAST(test.column2_utf8view AS Utf8)) AS c1
02)--Projection: CAST(test.column1_utf8view AS Utf8) AS __common_expr_1, test.column2_utf8view
03)----TableScan: test projection=[column1_utf8view, column2_utf8view]

## Ensure no casts for SPLIT_PART
## TODO file ticket
query TT
Expand Down

0 comments on commit e8ac93a

Please sign in to comment.