Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[SPARK-50582][SQL][PYTHON] Add quote builtin function #49191

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -2160,6 +2160,14 @@ public UTF8String soundex() {
return UTF8String.fromBytes(sx);
}

/**
 * Returns this string enclosed in single quotes, with each embedded single
 * quote preceded by a backslash, e.g. {@code Don't} becomes {@code 'Don\'t'}.
 *
 * @return a new UTF8String wrapped in single quotes with inner quotes escaped
 */
public UTF8String quote() {
  final String quoteChar = "'";
  // String.replace performs a literal (non-regex) replacement, avoiding the
  // regex compilation and replacement-string escaping that
  // replaceAll("'", "\\\\'") required. "\\'" here is the two characters \'
  String escaped = toString().replace(quoteChar, "\\'");
  return fromString(quoteChar + escaped + quoteChar);
}

@Override
public void writeExternal(ObjectOutput out) throws IOException {
byte[] bytes = getBytes();
Expand Down
7 changes: 7 additions & 0 deletions python/pyspark/sql/connect/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -3040,6 +3040,13 @@ def collation(col: "ColumnOrName") -> Column:
collation.__doc__ = pysparkfuncs.collation.__doc__


def quote(col: "ColumnOrName") -> Column:
    # Spark Connect binding: forwards to the server-side "quote" expression.
    return _invoke_function_over_columns("quote", col)


# Reuse the docstring from the classic (non-Connect) pyspark.sql.functions.quote
# so both entry points document identical behavior.
quote.__doc__ = pysparkfuncs.quote.__doc__


# Date/Timestamp functions


Expand Down
31 changes: 31 additions & 0 deletions python/pyspark/sql/functions/builtin.py
Original file line number Diff line number Diff line change
Expand Up @@ -17100,6 +17100,37 @@ def collation(col: "ColumnOrName") -> Column:
return _invoke_function_over_columns("collation", col)


@_try_remote_functions
def quote(col: "ColumnOrName") -> Column:
    r"""Returns `str` enclosed by single quotes and each instance of
    single quote in it is preceded by a backslash.

    .. versionadded:: 4.0.0

    Parameters
    ----------
    col : :class:`~pyspark.sql.Column` or column name
        target column to be quoted.

    Returns
    -------
    :class:`~pyspark.sql.Column`
        quoted string

    Examples
    --------
    >>> from pyspark.sql import functions as sf
    >>> df = spark.createDataFrame(["Don't"], "STRING")
    >>> df.select("*", sf.quote("value")).show()
    +-----+------------+
    |value|quote(value)|
    +-----+------------+
    |Don't|    'Don\'t'|
    +-----+------------+
    """
    # Invokes the built-in "quote" SQL function registered on the JVM side.
    return _invoke_function_over_columns("quote", col)


# ---------------------- Collection functions ------------------------------


Expand Down
9 changes: 9 additions & 0 deletions sql/api/src/main/scala/org/apache/spark/sql/functions.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5074,6 +5074,15 @@ object functions {
*/
def right(str: Column, len: Column): Column = Column.fn("right", str, len)

/**
 * Returns `str` enclosed by single quotes and each instance of single quote in it is preceded
 * by a backslash.
 *
 * @param str
 *   the string column to be quoted
 * @group string_funcs
 * @since 4.0.0
 */
def quote(str: Column): Column = Column.fn("quote", str)

//////////////////////////////////////////////////////////////////////////////////////////////
// DateTime functions
//////////////////////////////////////////////////////////////////////////////////////////////
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,7 @@ object FunctionRegistry {
expression[MakeValidUTF8]("make_valid_utf8"),
expression[ValidateUTF8]("validate_utf8"),
expression[TryValidateUTF8]("try_validate_utf8"),
expression[Quote]("quote"),

// url functions
expression[UrlEncode]("url_encode"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3723,3 +3723,40 @@ case class Luhncheck(input: Expression) extends RuntimeReplaceable with Implicit
override protected def withNewChildrenInternal(
newChildren: IndexedSeq[Expression]): Expression = copy(newChildren(0))
}

/**
 * A function that prepends a backslash to each instance of single quote
 * in the given string and encloses the result by single quotes.
 */
// NOTE: `usage` is kept as a single string so that no stray newline characters
// leak into the generated SQL function documentation.
@ExpressionDescription(
  usage = "_FUNC_(str) - Returns `str` enclosed by single quotes and each instance of single quote in it is preceded by a backslash.",
  examples = """
    Examples:
      > SELECT _FUNC_('Don\'t');
       'Don\'t'
  """,
  since = "4.0.0",
  group = "string_funcs")
case class Quote(input: Expression) extends RuntimeReplaceable with ImplicitCastInputTypes
  with UnaryLike[Expression] {
  // A null input produces a null output.
  override def nullIntolerant: Boolean = true

  // Runtime-replaceable: delegate evaluation to UTF8String.quote() via Invoke
  // instead of generating dedicated code for this expression.
  override lazy val replacement: Expression = Invoke(input, "quote", input.dataType)

  override def inputTypes: Seq[AbstractDataType] = {
    Seq(StringTypeWithCollation(supportsTrimCollation = true))
  }

  override def nodeName: String = "quote"

  override def nullable: Boolean = true

  override def child: Expression = input

  override protected def withNewChildInternal(newChild: Expression): Quote = {
    copy(input = newChild)
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,8 @@
| org.apache.spark.sql.catalyst.expressions.If | if | SELECT if(1 < 2, 'a', 'b') | struct<(IF((1 < 2), a, b)):string> |
| org.apache.spark.sql.catalyst.expressions.In | in | SELECT 1 in(1, 2, 3) | struct<(1 IN (1, 2, 3)):boolean> |
| org.apache.spark.sql.catalyst.expressions.InitCap | initcap | SELECT initcap('sPark sql') | struct<initcap(sPark sql):string> |
| org.apache.spark.sql.catalyst.expressions.Inline | inline | SELECT inline(array(struct(1, 'a'), struct(2, 'b'))) | struct<col1:int,col2:string> |
| org.apache.spark.sql.catalyst.expressions.Inline | inline_outer | SELECT inline_outer(array(struct(1, 'a'), struct(2, 'b'))) | struct<col1:int,col2:string> |
| org.apache.spark.sql.catalyst.expressions.InlineExpressionBuilder | inline | SELECT inline(array(struct(1, 'a'), struct(2, 'b'))) | struct<col1:int,col2:string> |
| org.apache.spark.sql.catalyst.expressions.InlineExpressionBuilder | inline_outer | SELECT inline_outer(array(struct(1, 'a'), struct(2, 'b'))) | struct<col1:int,col2:string> |
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this related to this PR?

Copy link
Member Author

@sarutak sarutak Jan 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@dongjoon-hyun
Hmm — it looks like #48503 should have re-generated the golden file, right?
If so, I'll remove this irrelevant change manually, and open another PR for followup.
What do you think?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, I agree with you. Let's handle them independently.

| org.apache.spark.sql.catalyst.expressions.InputFileBlockLength | input_file_block_length | SELECT input_file_block_length() | struct<input_file_block_length():bigint> |
| org.apache.spark.sql.catalyst.expressions.InputFileBlockStart | input_file_block_start | SELECT input_file_block_start() | struct<input_file_block_start():bigint> |
| org.apache.spark.sql.catalyst.expressions.InputFileName | input_file_name | SELECT input_file_name() | struct<input_file_name():string> |
Expand Down Expand Up @@ -253,11 +253,12 @@
| org.apache.spark.sql.catalyst.expressions.PercentRank | percent_rank | SELECT a, b, percent_rank(b) OVER (PARTITION BY a ORDER BY b) FROM VALUES ('A1', 2), ('A1', 1), ('A2', 3), ('A1', 1) tab(a, b) | struct<a:string,b:int,PERCENT_RANK() OVER (PARTITION BY a ORDER BY b ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW):double> |
| org.apache.spark.sql.catalyst.expressions.Pi | pi | SELECT pi() | struct<PI():double> |
| org.apache.spark.sql.catalyst.expressions.Pmod | pmod | SELECT pmod(10, 3) | struct<pmod(10, 3):int> |
| org.apache.spark.sql.catalyst.expressions.PosExplode | posexplode | SELECT posexplode(array(10,20)) | struct<pos:int,col:int> |
| org.apache.spark.sql.catalyst.expressions.PosExplode | posexplode_outer | SELECT posexplode_outer(array(10,20)) | struct<pos:int,col:int> |
| org.apache.spark.sql.catalyst.expressions.PosExplodeExpressionBuilder | posexplode | SELECT posexplode(array(10,20)) | struct<pos:int,col:int> |
| org.apache.spark.sql.catalyst.expressions.PosExplodeExpressionBuilder | posexplode_outer | SELECT posexplode_outer(array(10,20)) | struct<pos:int,col:int> |
| org.apache.spark.sql.catalyst.expressions.Pow | pow | SELECT pow(2, 3) | struct<pow(2, 3):double> |
| org.apache.spark.sql.catalyst.expressions.Pow | power | SELECT power(2, 3) | struct<POWER(2, 3):double> |
| org.apache.spark.sql.catalyst.expressions.Quarter | quarter | SELECT quarter('2016-08-31') | struct<quarter(2016-08-31):int> |
| org.apache.spark.sql.catalyst.expressions.Quote | quote | SELECT quote('Don\'t') | struct<quote(Don't):string> |
| org.apache.spark.sql.catalyst.expressions.RLike | regexp | SELECT regexp('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
| org.apache.spark.sql.catalyst.expressions.RLike | regexp_like | SELECT regexp_like('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<REGEXP_LIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
| org.apache.spark.sql.catalyst.expressions.RLike | rlike | SELECT rlike('%SystemDrive%\Users\John', '%SystemDrive%\\Users.*') | struct<RLIKE(%SystemDrive%UsersJohn, %SystemDrive%\Users.*):boolean> |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1796,3 +1796,24 @@ select try_validate_utf8(x'80')
-- !query analysis
Project [try_validate_utf8(cast(0x80 as string)) AS try_validate_utf8(X'80')#x]
+- OneRowRelation


-- !query
select quote('Spark')
-- !query analysis
Project [quote(Spark) AS quote(Spark)#x]
+- OneRowRelation


-- !query
select quote("Don't")
-- !query analysis
Project [quote(Don't) AS quote(Don't)#x]
+- OneRowRelation


-- !query
select quote(NULL)
-- !query analysis
Project [quote(cast(null as string)) AS quote(NULL)#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -1796,3 +1796,24 @@ select try_validate_utf8(x'80')
-- !query analysis
Project [try_validate_utf8(cast(0x80 as string)) AS try_validate_utf8(X'80')#x]
+- OneRowRelation


-- !query
select quote('Spark')
-- !query analysis
Project [quote(Spark) AS quote(Spark)#x]
+- OneRowRelation


-- !query
select quote("Don't")
-- !query analysis
Project [quote(Don't) AS quote(Don't)#x]
+- OneRowRelation


-- !query
select quote(NULL)
-- !query analysis
Project [quote(cast(null as string)) AS quote(NULL)#x]
+- OneRowRelation
Original file line number Diff line number Diff line change
Expand Up @@ -306,3 +306,8 @@ select validate_utf8(x'80');
select try_validate_utf8('');
select try_validate_utf8('abc');
select try_validate_utf8(x'80');

-- quote
select quote('Spark');
select quote("Don't");
select quote(NULL);
Original file line number Diff line number Diff line change
Expand Up @@ -2277,3 +2277,27 @@ select try_validate_utf8(x'80')
struct<try_validate_utf8(X'80'):string>
-- !query output
NULL


-- !query
select quote('Spark')
-- !query schema
struct<quote(Spark):string>
-- !query output
'Spark'


-- !query
select quote("Don't")
-- !query schema
struct<quote(Don't):string>
-- !query output
'Don\'t'


-- !query
select quote(NULL)
-- !query schema
struct<quote(NULL):string>
-- !query output
NULL
Original file line number Diff line number Diff line change
Expand Up @@ -2341,3 +2341,27 @@ select try_validate_utf8(x'80')
struct<try_validate_utf8(X'80'):string>
-- !query output
NULL


-- !query
select quote('Spark')
-- !query schema
struct<quote(Spark):string>
-- !query output
'Spark'


-- !query
select quote("Don't")
-- !query schema
struct<quote(Don't):string>
-- !query output
'Don\'t'


-- !query
select quote(NULL)
-- !query schema
struct<quote(NULL):string>
-- !query output
NULL
Original file line number Diff line number Diff line change
Expand Up @@ -1452,4 +1452,21 @@ class StringFunctionsSuite extends QueryTest with SharedSparkSession {
Seq(Row("abc", "def")))
}
}

test("SPARK-50582: string quote function") {
  val df = Seq(("Don't")).toDF("value")

  // DataFrame API: the embedded single quote is backslash-escaped and the
  // whole value is wrapped in single quotes.
  checkAnswer(
    df.select(quote($"value")),
    Row("'Don\\'t'"))

  // SQL expression form with a plain string.
  checkAnswer(
    df.selectExpr("quote('Spark')"),
    Row("'Spark'"))

  // NULL input propagates to a NULL result.
  checkAnswer(
    df.selectExpr("quote(NULL)"),
    Row(null))
}
}
Loading