Skip to content

Commit

Permalink
[GLUTEN-7367][CH] Revert #7101 (#7368)
Browse files Browse the repository at this point in the history
* Revert "[GLUTEN-7096] [CH] fix exception when same names in group by (#7101)"

This reverts commit c8cfe11.

* Add UT for 'GLUTEN-7367: Memory limit exceeded'

* fix style
  • Loading branch information
baibaichen authored Sep 26, 2024
1 parent 2bac910 commit 028af4b
Show file tree
Hide file tree
Showing 10 changed files with 113 additions and 242 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ import org.apache.gluten.extension.columnar.AddFallbackTagRule
import org.apache.gluten.extension.columnar.MiscColumnarRules.TransformPreOverrides
import org.apache.gluten.sql.shims.SparkShimLoader
import org.apache.gluten.substrait.expression.{ExpressionBuilder, ExpressionNode, WindowFunctionNode}
import org.apache.gluten.utils.{CHAggUtil, CHJoinValidateUtil, UnknownJoinStrategy}
import org.apache.gluten.utils.{CHJoinValidateUtil, UnknownJoinStrategy}
import org.apache.gluten.vectorized.CHColumnarBatchSerializer

import org.apache.spark.ShuffleDependency
Expand Down Expand Up @@ -160,7 +160,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi with Logging {
child: SparkPlan): HashAggregateExecBaseTransformer =
CHHashAggregateExecTransformer(
requiredChildDistributionExpressions,
CHAggUtil.distinctIgnoreQualifier(groupingExpressions),
groupingExpressions.distinct,
aggregateExpressions,
aggregateAttributes,
initialInputBufferOffset,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ import org.apache.gluten.substrait.{AggregationParams, SubstraitContext}
import org.apache.gluten.substrait.expression.{AggregateFunctionNode, ExpressionBuilder, ExpressionNode}
import org.apache.gluten.substrait.extensions.{AdvancedExtensionNode, ExtensionBuilder}
import org.apache.gluten.substrait.rel.{RelBuilder, RelNode}
import org.apache.gluten.utils.CHAggUtil

import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.expressions.aggregate._
Expand Down Expand Up @@ -430,15 +429,6 @@ case class CHHashAggregateExecPullOutHelper(
aggregateAttr.toList
}

override def allAggregateResultAttributes(
    groupingExpressions: Seq[NamedExpression]): List[Attribute] = {
  // When there are aggregate expressions, defer to the base implementation
  // unchanged. Otherwise the grouping keys alone form the output, and they are
  // deduplicated ignoring qualifiers so same-named keys from different
  // qualifiers do not yield duplicate result attributes.
  val effectiveGroupings =
    if (aggregateExpressions.nonEmpty) groupingExpressions
    else CHAggUtil.distinctIgnoreQualifier(groupingExpressions)
  super.allAggregateResultAttributes(effectiveGroupings)
}

protected def getAttrForAggregateExpr(
exp: AggregateExpression,
aggregateAttributeList: Seq[Attribute],
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
10000000157,132342342,124123,3123,22.334,1234.244,1434.242343,1,1,1,''ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOM','ATOM','ATOM',12,2013-3-31,2013-3-31,true,1
10000000157,132322342,14123,313,12.34,124.44,14.242343,2,7,1,'FT','FT','FT',2,2014-3-31,2012-3-21,true,1
10000000158,332342342,1241,31233,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2012-3-31,2012-3-31,true,1
10000000158,,,,,,,,,,,,,,,2012-3-21 10:10:10.789,,
10000000159,,,,,,,,,,,,,,,,,
10000000160,,,,,,,,,,,,,,,,,
10000000160,,,,,,,,,,,,,,,,,
10000000161,332342342,1241,31233,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2012-3-31,2012-3-31,true,999.99
10000000162,332342342,1241,31233,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2012-3-31,2012-3-31,true,999.99
10000000163,332342342,1241,31233,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2012-3-31,2012-3-31,true,999.99
10000000164,332342342,1241,10,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2014-4-1,2012-3-31,true,10.11
10000000165,332342342,1241,11,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2014-4-1,2012-3-31,true,10.12
10000000165,332342342,1241,11,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2014-4-2,2012-3-31,true,10.11
10000000166,332342342,1241,12,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2014-4-2,2012-3-31,true,10.11
10000000167,332342342,1241,17,29.334,123422.244,1434.24222343,5,11,3,'中国','中国','中国',12,2014-4-2,2012-3-31,true,10.13
10000000168,332342342,1241,17,29.334,123422.244,1434.24222343,5,11,3,Ch_na,'中国','中国',12,2014-4-2,2012-3-31,true,10.13
10000000169,332342342,1241,17,29.334,123422.244,1434.24222343,5,11,3,Ch%na,'中国','中国',12,2014-4-2,2012-3-31,true,10.13
10000000170,332342342,1241,17,29.334,123422.244,1434.24222343,5,11,3,China,'中国','中国',12,2014-4-2,2012-3-31,true,10.13
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
10000000157,132342342,124123,3123,22.334,1234.244,1434.242343,1,1,1,''ATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOMATOM','ATOM','ATOM',12,2013-3-31,2013-3-31,true
10000000158,,,,,,,,,,,,,,,,
10000000159,,,,,,,,,,,,,,,,
10000000160,,,,,,,,,,,,,,,,
Original file line number Diff line number Diff line change
Expand Up @@ -1301,6 +1301,95 @@ class GlutenClickHouseFileFormatSuite
_ => {}
)
}
test("GLUTEN-7367: Memory limit exceeded") {
  // Regression test for GLUTEN-7367: count(distinct ...) over all columns of
  // TEST_MEASURE left-joined with TEST_MEASURE1 previously failed with
  // "Memory limit exceeded". Results are compared against vanilla Spark.

  // The two tables share the same 17-column layout and differ only in whether
  // the NAME1..NAME3 string columns are nullable, so build the schema once.
  def measureSchema(namesNullable: Boolean): StructType =
    StructType.apply(
      Seq(
        StructField.apply("ID1", LongType, nullable = false),
        StructField.apply("ID2", LongType, nullable = false),
        StructField.apply("ID3", LongType, nullable = false),
        StructField.apply("ID4", IntegerType, nullable = false),
        StructField.apply("PRICE1", FloatType, nullable = false),
        StructField.apply("PRICE2", DoubleType, nullable = false),
        StructField.apply("PRICE3", DecimalType(19, 4), nullable = false),
        StructField.apply("PRICE5", ShortType, nullable = false),
        StructField.apply("PRICE6", ByteType, nullable = false),
        StructField.apply("PRICE7", ShortType, nullable = false),
        StructField.apply("NAME1", StringType, nullable = namesNullable),
        StructField.apply("NAME2", StringType, nullable = namesNullable),
        StructField.apply("NAME3", StringType, nullable = namesNullable),
        StructField.apply("NAME4", ByteType, nullable = false),
        StructField.apply("TIME1", DateType, nullable = false),
        StructField.apply("TIME2", TimestampType, nullable = false),
        StructField.apply("FLAG", BooleanType, nullable = false)
      ))

  // Registers `<name>.csv` from the default CSV data directory as a temp view.
  def registerCsvView(name: String, schema: StructType): Unit =
    spark.read
      .schema(schema)
      .csv(csvDataPath + "/default/" + name + ".csv")
      .toDF()
      .createTempView(name)

  registerCsvView("TEST_MEASURE", measureSchema(namesNullable = true))
  registerCsvView("TEST_MEASURE1", measureSchema(namesNullable = false))

  withSQLConf(
    (CHConf.runtimeSettings("use_excel_serialization"), "false"),
    ("spark.gluten.sql.text.input.empty.as.default", "true")) {
    // Sanity checks: plain scans of both tables must already match vanilla Spark.
    compareResultsAgainstVanillaSpark(
      """
        | select * from TEST_MEASURE
        |""".stripMargin,
      compareResult = true,
      _ => {}
    )

    compareResultsAgainstVanillaSpark(
      """
        | select * from TEST_MEASURE1
        |""".stripMargin,
      compareResult = true,
      _ => {}
    )

    // The query that previously exceeded the memory limit.
    val sqlStr =
      """select `TEST_MEASURE`.`ID1`,
        | count(distinct `TEST_MEASURE`.`ID1`, `TEST_MEASURE`.`ID2`, `TEST_MEASURE`.`ID3`,
        | `TEST_MEASURE`.`ID4`,`TEST_MEASURE`.`PRICE1`, `TEST_MEASURE`.`PRICE2`,
        | `TEST_MEASURE`.`PRICE3`, `TEST_MEASURE`.`PRICE5`,`TEST_MEASURE`.`PRICE6`,
        | `TEST_MEASURE`.`PRICE7`, `TEST_MEASURE`.`NAME1`, `TEST_MEASURE`.`NAME2`,
        | `TEST_MEASURE`.`NAME3`, `TEST_MEASURE`.`NAME4`, `TEST_MEASURE`.`TIME1`,
        | `TEST_MEASURE`.`TIME2`,`TEST_MEASURE`.`FLAG`),
        | 1
        |from `TEST_MEASURE`
        | left join `TEST_MEASURE1` on `TEST_MEASURE`.`ID1` = `TEST_MEASURE1`.`ID1`
        |group by `TEST_MEASURE`.`ID1`""".stripMargin

    compareResultsAgainstVanillaSpark(sqlStr, compareResult = true, _ => {})
  }
}

test("issues-3609 int read test") {
withSQLConf((CHConf.runtimeSettings("use_excel_serialization.number_force"), "false")) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,50 +122,4 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase {
ignoreIfNotExists = true,
purge = false)
}

testGluten("GLUTEN-7096: Same names in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // `days` aliases `day1`, so the group-by list carries the same underlying
  // attribute under two names; this previously raised an exception.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | )) group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}

testGluten("GLUTEN-7096: Same names with different qualifier in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // Subquery aliases t1/t2 give the duplicated group-by names different
  // qualifiers; this variant previously raised an exception as well.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | ) t1 ) t2 group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -122,50 +122,4 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase {
ignoreIfNotExists = true,
purge = false)
}

testGluten("GLUTEN-7096: Same names in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // `days` aliases `day1`, so the group-by list carries the same underlying
  // attribute under two names; this previously raised an exception.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | )) group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}

testGluten("GLUTEN-7096: Same names with different qualifier in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // Subquery aliases t1/t2 give the duplicated group-by names different
  // qualifiers; this variant previously raised an exception as well.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | ) t1 ) t2 group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -122,50 +122,4 @@ class GlutenHiveSQLQueryCHSuite extends GlutenHiveSQLQuerySuiteBase {
ignoreIfNotExists = true,
purge = false)
}

testGluten("GLUTEN-7096: Same names in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // `days` aliases `day1`, so the group-by list carries the same underlying
  // attribute under two names; this previously raised an exception.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | )) group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}

testGluten("GLUTEN-7096: Same names with different qualifier in group by may cause exception") {
  // Populate a small table (with a duplicate row) to drive the aggregate path.
  sql("create table if not exists test_7096 (day string, rtime int, uid string, owner string)")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-01', 123, 'user1', 'owner1')")
  sql("insert into test_7096 values ('2024-09-02', 567, 'user2', 'owner2')")
  // Subquery aliases t1/t2 give the duplicated group-by names different
  // qualifiers; this variant previously raised an exception as well.
  val groupBySql =
    """
      |select days, rtime, uid, owner, day1
      |from (
      | select day1 as days, rtime, uid, owner, day1
      | from (
      | select distinct coalesce(day, "today") as day1, rtime, uid, owner
      | from test_7096 where day = '2024-09-01'
      | ) t1 ) t2 group by days, rtime, uid, owner, day1
      |""".stripMargin
  val result = sql(groupBySql)
  checkAnswer(result, Seq(Row("2024-09-01", 123, "user1", "owner1", "2024-09-01")))
  // Drop the table so reruns start clean.
  spark.sessionState.catalog
    .dropTable(TableIdentifier("test_7096"), ignoreIfNotExists = true, purge = false)
}
}
Loading

0 comments on commit 028af4b

Please sign in to comment.