diff --git a/.github/workflows/clickhouse_be_trigger.yml b/.github/workflows/clickhouse_be_trigger.yml
index a45dfc8a948d..fbc3eac86393 100644
--- a/.github/workflows/clickhouse_be_trigger.yml
+++ b/.github/workflows/clickhouse_be_trigger.yml
@@ -22,9 +22,8 @@ on:
       - '.github/workflows/clickhouse_be_trigger.yml'
       - 'pom.xml'
       - 'backends-clickhouse/**'
-      - 'gluten-celeborn/common/**'
-      - 'gluten-celeborn/package/**'
-      - 'gluten-celeborn/clickhouse/**'
+      - 'gluten-celeborn/**'
+      - 'gluten-iceberg/**'
       - 'gluten-core/**'
       - 'gluten-substrait/**'
       - 'gluten-ut/**'
diff --git a/.github/workflows/velox_backend.yml b/.github/workflows/velox_backend.yml
index 5feb5fa56855..2bba907bb0db 100644
--- a/.github/workflows/velox_backend.yml
+++ b/.github/workflows/velox_backend.yml
@@ -22,9 +22,7 @@ on:
       - 'pom.xml'
       - 'backends-velox/**'
       - 'gluten-uniffle/**'
-      - 'gluten-celeborn/common/**'
-      - 'gluten-celeborn/package/**'
-      - 'gluten-celeborn/velox/**'
+      - 'gluten-celeborn/**'
      - 'gluten-ras/**'
      - 'gluten-core/**'
      - 'gluten-substrait/**'
diff --git a/backends-clickhouse/pom.xml b/backends-clickhouse/pom.xml
index 9a606c48e694..e34f571961d6 100644
--- a/backends-clickhouse/pom.xml
+++ b/backends-clickhouse/pom.xml
@@ -14,6 +14,35 @@
   <name>Gluten Backends ClickHouse</name>

   <profiles>
+    <profile>
+      <id>celeborn</id>
+      <activation>
+        <activeByDefault>false</activeByDefault>
+      </activation>
+      <dependencies>
+        <dependency>
+          <groupId>org.apache.gluten</groupId>
+          <artifactId>gluten-celeborn</artifactId>
+          <version>${project.version}</version>
+        </dependency>
+        <dependency>
+          <groupId>org.apache.celeborn</groupId>
+          <artifactId>celeborn-client-spark-${spark.major.version}-shaded_${scala.binary.version}</artifactId>
+          <version>${celeborn.version}</version>
+          <scope>provided</scope>
+          <exclusions>
+            <exclusion>
+              <groupId>org.apache.celeborn</groupId>
+              <artifactId>celeborn-client-spark-${spark.major.version}_${scala.binary.version}</artifactId>
+            </exclusion>
+            <exclusion>
+              <groupId>org.apache.celeborn</groupId>
+              <artifactId>celeborn-spark-${spark.major.version}-columnar-shuffle_${scala.binary.version}</artifactId>
+            </exclusion>
+          </exclusions>
+        </dependency>
+      </dependencies>
+    </profile>
     <profile>
       <id>iceberg</id>
diff --git a/gluten-celeborn/clickhouse/src/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory b/backends-clickhouse/src-celeborn/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory
similarity index 100%
rename from gluten-celeborn/clickhouse/src/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory
rename to backends-clickhouse/src-celeborn/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory
diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala b/backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala
similarity index 100%
rename from gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala
rename to backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarBatchSerializer.scala
diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala b/backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala
similarity index 100%
rename from gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala
rename to backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriter.scala
diff --git a/gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriterFactory.scala b/backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriterFactory.scala
similarity index 100%
rename from
gluten-celeborn/clickhouse/src/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriterFactory.scala rename to backends-clickhouse/src-celeborn/main/scala/org/apache/spark/shuffle/CHCelebornColumnarShuffleWriterFactory.scala diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q01.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q01.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q01.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q01.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q02.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q02.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q02.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q02.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q03.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q03.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q03.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q03.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q04.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q04.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q04.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q04.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q05.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q05.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q05.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q05.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q06.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q06.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q06.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q06.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q07.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q07.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q07.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q07.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q08.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q08.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q08.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q08.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q09.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q09.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q09.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q09.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q10.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q10.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q10.out rename to 
backends-clickhouse/src-celeborn/test/resources/queries-output/q10.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q11.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q11.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q11.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q11.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q12.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q12.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q12.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q12.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q13.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q13.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q13.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q13.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q14.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q14.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q14.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q14.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q15.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q15.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q15.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q15.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q16.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q16.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q16.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q16.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q17.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q17.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q17.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q17.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q18.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q18.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q18.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q18.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q19.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q19.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q19.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q19.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q20.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q20.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q20.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q20.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q21.out 
b/backends-clickhouse/src-celeborn/test/resources/queries-output/q21.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q21.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q21.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries-output/q22.out b/backends-clickhouse/src-celeborn/test/resources/queries-output/q22.out similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries-output/q22.out rename to backends-clickhouse/src-celeborn/test/resources/queries-output/q22.out diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q1.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q1.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q1.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q1.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q10.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q10.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q10.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q10.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q11.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q11.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q11.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q11.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q12.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q12.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q12.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q12.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q13.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q13.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q13.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q13.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q14.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q14.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q14.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q14.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q15.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q15.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q15.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q15.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q16.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q16.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q16.sql rename to 
backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q16.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q17.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q17.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q17.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q17.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q18.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q18.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q18.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q18.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q19.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q19.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q19.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q19.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q2.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q2.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q2.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q2.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q20.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q20.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q20.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q20.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q21.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q21.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q21.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q21.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q22.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q22.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q22.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q22.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q3.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q3.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q3.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q3.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q4.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q4.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q4.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q4.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q5.sql 
b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q5.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q5.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q5.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q6.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q6.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q6.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q6.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q7.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q7.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q7.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q7.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q8.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q8.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q8.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q8.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q9.sql b/backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q9.sql similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/queries/tpch-queries-ch/q9.sql rename to backends-clickhouse/src-celeborn/test/resources/queries/tpch-queries-ch/q9.sql diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/data.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/data.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/data.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/data.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/data.mrk3 
b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/data.mrk3 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/data.mrk3 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/data.mrk3 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/customer/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/customer/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_comment.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.bin 
b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_commitdate.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_discount.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_extendedprice.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linenumber.mrk2 diff --git 
a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_linestatus.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_orderkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_partkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_quantity.mrk2 diff --git 
a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_receiptdate.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_returnflag.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipdate.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.mrk2 rename to 
backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipinstruct.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_shipmode.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_suppkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/lineitem/all_1_1_0/l_tax.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/columns.txt diff --git 
a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/data.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/data.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/data.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/data.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/data.mrk3 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/data.mrk3 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/data.mrk3 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/data.mrk3 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/nation/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/nation/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.bin 
b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_clerk.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_comment.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_custkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderdate.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.bin similarity index 100% rename from 
gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderpriority.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_orderstatus.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_shippriority.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.bin similarity index 100% rename from 
gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/orders/all_1_1_0/o_totalprice.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/data.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/data.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/data.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/data.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/data.mrk3 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/data.mrk3 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/data.mrk3 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/data.mrk3 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/part/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/part/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/checksums.txt rename to 
backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_availqty.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_comment.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.bin rename to 
backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_partkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_suppkey.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.mrk2 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.mrk2 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.mrk2 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/partsupp/all_1_1_0/ps_supplycost.mrk2 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/count.txt 
diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/data.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/data.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/data.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/data.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/data.mrk3 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/data.mrk3 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/data.mrk3 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/data.mrk3 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/default_compression_codec.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/region/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/region/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/checksums.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/checksums.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/checksums.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/checksums.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/columns.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/columns.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/columns.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/columns.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/count.txt b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/count.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/count.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/count.txt diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/data.bin b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/data.bin similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/data.bin rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/data.bin diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/data.mrk3 b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/data.mrk3 similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/data.mrk3 rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/data.mrk3 diff --git a/gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/default_compression_codec.txt 
b/backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/default_compression_codec.txt similarity index 100% rename from gluten-celeborn/clickhouse/src/test/resources/tpch-data-ch/supplier/all_1_1_0/default_compression_codec.txt rename to backends-clickhouse/src-celeborn/test/resources/tpch-data-ch/supplier/all_1_1_0/default_compression_codec.txt diff --git a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala b/backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala similarity index 95% rename from gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala rename to backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala index ee7657c505ac..10350898cf88 100644 --- a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala +++ b/backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarMemorySortShuffleSuite.scala @@ -26,10 +26,10 @@ class GlutenClickHouseRSSColumnarMemorySortShuffleSuite override protected val tablesPath: String = basePath + "/tpch-data-ch" override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" override protected val queriesResults: String = - rootPath + "../../../../../backends-clickhouse/src/test/resources/mergetree-queries-output" + rootPath + "../../../../backends-clickhouse/src/test/resources/mergetree-queries-output" override protected val parquetTableDataPath: String = - "../../../../../gluten-core/src/test/resources/tpch-data" + "../../../../gluten-core/src/test/resources/tpch-data" /** Run Gluten + ClickHouse Backend with ColumnarShuffleManager */ override protected def sparkConf: SparkConf = { diff --git a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala b/backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala similarity index 97% rename from gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala rename to backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala index e62dbdd2a5fe..4c62ee73f0f7 100644 --- a/gluten-celeborn/clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala +++ b/backends-clickhouse/src-celeborn/test/scala/org/apache/gluten/execution/GlutenClickHouseRSSColumnarShuffleAQESuite.scala @@ -30,10 +30,10 @@ class GlutenClickHouseRSSColumnarShuffleAQESuite override protected val tablesPath: String = basePath + "/tpch-data-ch" override protected val tpchQueries: String = rootPath + "queries/tpch-queries-ch" override protected val queriesResults: String = - rootPath + "../../../../../backends-clickhouse/src/test/resources/mergetree-queries-output" + rootPath + "../../../../backends-clickhouse/src/test/resources/mergetree-queries-output" override protected val parquetTableDataPath: String = - "../../../../../gluten-core/src/test/resources/tpch-data" + "../../../../gluten-core/src/test/resources/tpch-data" /** Run Gluten + ClickHouse Backend with ColumnarShuffleManager */ override protected def sparkConf: 
SparkConf = { diff --git a/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala index 05f7fdbfa423..cd3ce793747c 100644 --- a/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala +++ b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/delta/ClickhouseOptimisticTransaction.scala @@ -29,9 +29,8 @@ import org.apache.spark.sql.delta.files._ import org.apache.spark.sql.delta.hooks.AutoCompact import org.apache.spark.sql.delta.schema.{InnerInvariantViolationException, InvariantViolationException} import org.apache.spark.sql.delta.sources.DeltaSQLConf -import org.apache.spark.sql.execution.{QueryExecution, SparkPlan, SQLExecution} -import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec -import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, FileFormatWriter, GlutenWriterColumnarRules, WriteFiles, WriteJobStatsTracker} +import org.apache.spark.sql.execution.SQLExecution +import org.apache.spark.sql.execution.datasources.{BasicWriteJobStatsTracker, DeltaV1Writes, FileFormatWriter, GlutenWriterColumnarRules, WriteJobStatsTracker} import org.apache.spark.sql.execution.datasources.v1.MergeTreeWriterInjects import org.apache.spark.sql.execution.datasources.v1.clickhouse.MergeTreeFileFormatWriter import org.apache.spark.sql.execution.datasources.v2.clickhouse.ClickHouseConfig @@ -229,31 +228,12 @@ class ClickhouseOptimisticTransaction( val (data, partitionSchema) = performCDCPartition(inputData) val outputPath = deltaLog.dataPath - val fileFormat = deltaLog.fileFormat(protocol, metadata) // TODO support changing formats. - - // Iceberg spec requires partition columns in data files - val writePartitionColumns = IcebergCompat.isAnyEnabled(metadata) - // Retain only a minimal selection of Spark writer options to avoid any potential - // compatibility issues - val options = (writeOptions match { - case None => Map.empty[String, String] - case Some(writeOptions) => - writeOptions.options.filterKeys { - key => - key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || - key.equalsIgnoreCase(DeltaOptions.COMPRESSION) - }.toMap - }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString) - - val (normalQueryExecution, output, generatedColumnConstraints, _) = + val (queryExecution, output, generatedColumnConstraints, _) = normalizeData(deltaLog, writeOptions, data) val partitioningColumns = getPartitioningColumns(partitionSchema, output) - val logicalPlan = normalQueryExecution.optimizedPlan - val write = - WriteFiles(logicalPlan, fileFormat, partitioningColumns, None, options, Map.empty) + val fileFormat = deltaLog.fileFormat(protocol, metadata) // TODO support changing formats. 
- val queryExecution = new QueryExecution(spark, write) val (committer, collectStats) = fileFormat.toString match { case "MergeTree" => (getCommitter2(outputPath), false) case _ => (getCommitter(outputPath), true) @@ -274,20 +254,24 @@ class ClickhouseOptimisticTransaction( SQLExecution.withNewExecutionId(queryExecution, Option("deltaTransactionalWrite")) { val outputSpec = FileFormatWriter.OutputSpec(outputPath.toString, Map.empty, output) - val physicalPlan = materializeAdaptiveSparkPlan(queryExecution.executedPlan) - // convertEmptyToNullIfNeeded(queryExecution.executedPlan, partitioningColumns, constraints) - /* val checkInvariants = DeltaInvariantCheckerExec(empty2NullPlan, constraints) + val empty2NullPlan = + convertEmptyToNullIfNeeded(queryExecution.sparkPlan, partitioningColumns, constraints) + // TODO: val checkInvariants = DeltaInvariantCheckerExec(empty2NullPlan, constraints) + val checkInvariants = empty2NullPlan + // No need to plan optimized write if the write command is OPTIMIZE, which aims to produce // evenly-balanced data files already. - val physicalPlan = - if ( - !isOptimize && - shouldOptimizeWrite(writeOptions, spark.sessionState.conf) - ) { - DeltaOptimizedWriterExec(checkInvariants, metadata.partitionColumns, deltaLog) - } else { - checkInvariants - } */ + // TODO: val physicalPlan = + // if ( + // !isOptimize && + // shouldOptimizeWrite(writeOptions, spark.sessionState.conf) + // ) { + // DeltaOptimizedWriterExec(checkInvariants, metadata.partitionColumns, deltaLog) + // } else { + // checkInvariants + // } + val physicalPlan = checkInvariants + val statsTrackers: ListBuffer[WriteJobStatsTracker] = ListBuffer() if (spark.conf.get(DeltaSQLConf.DELTA_HISTORY_METRICS_ENABLED)) { @@ -298,10 +282,33 @@ class ClickhouseOptimisticTransaction( statsTrackers.append(basicWriteJobStatsTracker) } + // Iceberg spec requires partition columns in data files + val writePartitionColumns = IcebergCompat.isAnyEnabled(metadata) + // Retain only a minimal selection of Spark writer options to avoid any potential + // compatibility issues + val options = (writeOptions match { + case None => Map.empty[String, String] + case Some(writeOptions) => + writeOptions.options.filterKeys { + key => + key.equalsIgnoreCase(DeltaOptions.MAX_RECORDS_PER_FILE) || + key.equalsIgnoreCase(DeltaOptions.COMPRESSION) + }.toMap + }) + (DeltaOptions.WRITE_PARTITION_COLUMNS -> writePartitionColumns.toString) + + val executedPlan = DeltaV1Writes( + spark, + physicalPlan, + fileFormat, + partitioningColumns, + None, + options + ).executedPlan + try { DeltaFileFormatWriter.write( sparkSession = spark, - plan = physicalPlan, + plan = executedPlan, fileFormat = fileFormat, committer = committer, outputSpec = outputSpec, @@ -358,8 +365,4 @@ class ClickhouseOptimisticTransaction( resultFiles.toSeq ++ committer.changeFiles } - private def materializeAdaptiveSparkPlan(plan: SparkPlan): SparkPlan = plan match { - case a: AdaptiveSparkPlanExec => a.finalPhysicalPlan - case p: SparkPlan => p - } } diff --git a/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/FileDeltaColumnarWrite.scala b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/FileDeltaColumnarWrite.scala index bf6b0c0074dc..df7ef7e23409 100644 --- a/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/FileDeltaColumnarWrite.scala +++ b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/FileDeltaColumnarWrite.scala @@ -137,7 +137,8 @@ case class 
FileDeltaColumnarWrite( // stats.map(row => x.apply(row).getString(0)).foreach(println) // process stats val commitInfo = DeltaFileCommitInfo(committer) - val basicNativeStat = NativeBasicWriteTaskStatsTracker(description, basicWriteJobStatsTracker) + val basicNativeStat = + NativeBasicWriteTaskStatsTracker(description.path, basicWriteJobStatsTracker) val basicNativeStats = Seq(commitInfo, basicNativeStat) NativeStatCompute(stats)(basicNativeStats, nativeDeltaStats) diff --git a/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala new file mode 100644 index 000000000000..8ae99cc0d59f --- /dev/null +++ b/backends-clickhouse/src-delta-32/main/scala/org/apache/spark/sql/execution/datasources/DeltaV1Writes.scala @@ -0,0 +1,74 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources +import org.apache.gluten.backendsapi.BackendsApiManager + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.catalog.BucketSpec +import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec +import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} +import org.apache.spark.sql.catalyst.plans.logical.LocalRelation +import org.apache.spark.sql.execution.{QueryExecution, SortExec, SparkPlan} +import org.apache.spark.sql.execution.datasources.V1WritesUtils.isOrderingMatched + +case class DeltaV1Writes( + spark: SparkSession, + query: SparkPlan, + fileFormat: FileFormat, + partitionColumns: Seq[Attribute], + bucketSpec: Option[BucketSpec], + options: Map[String, String], + staticPartitions: TablePartitionSpec = Map.empty) { + + require(fileFormat != null, "FileFormat is required to write files.") + require(BackendsApiManager.getSettings.enableNativeWriteFiles()) + + private lazy val requiredOrdering: Seq[SortOrder] = + V1WritesUtils.getSortOrder( + query.output, + partitionColumns, + bucketSpec, + options, + staticPartitions.size) + + lazy val sortPlan: SparkPlan = { + val outputOrdering = query.outputOrdering + val orderingMatched = isOrderingMatched(requiredOrdering.map(_.child), outputOrdering) + if (orderingMatched) { + query + } else { + SortExec(requiredOrdering, global = false, query) + } + } + + lazy val writePlan: SparkPlan = + WriteFilesExec( + sortPlan, + fileFormat = fileFormat, + partitionColumns = partitionColumns, + bucketSpec = bucketSpec, + options = options, + staticPartitions = staticPartitions) + + lazy val executedPlan: SparkPlan = + CallTransformer(spark, writePlan).executedPlan +} + +case class CallTransformer(spark: SparkSession, physicalPlan: SparkPlan) + extends 
QueryExecution(spark, LocalRelation()) { + override lazy val sparkPlan: SparkPlan = physicalPlan +} diff --git a/backends-clickhouse/src-delta-32/test/scala/org/apache/spark/sql/execution/datasources/DeltaV1WritesSuite.scala b/backends-clickhouse/src-delta-32/test/scala/org/apache/spark/sql/execution/datasources/DeltaV1WritesSuite.scala new file mode 100644 index 000000000000..1a90148df29e --- /dev/null +++ b/backends-clickhouse/src-delta-32/test/scala/org/apache/spark/sql/execution/datasources/DeltaV1WritesSuite.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution.datasources + +import org.apache.gluten.GlutenConfig +import org.apache.gluten.execution.{GlutenClickHouseWholeStageTransformerSuite, GlutenPlan, SortExecTransformer} +import org.apache.spark.SparkConf +import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat +import org.apache.spark.sql.execution.{SortExec, SparkPlan} + +class DeltaV1WritesSuite extends GlutenClickHouseWholeStageTransformerSuite { + + import testImplicits._ + + override protected def sparkConf: SparkConf = { + super.sparkConf + .set(GlutenConfig.NATIVE_WRITER_ENABLED.key, "true") + } + + override def beforeAll(): Unit = { + super.beforeAll() + (0 to 20) + .map(i => (i, i % 5, (i % 10).toString)) + .toDF("i", "j", "k") + .write + .saveAsTable("t0") + } + + override def afterAll(): Unit = { + sql("drop table if exists t0") + super.afterAll() + } + + val format = new ParquetFileFormat + def getSort(child: SparkPlan): Option[SortExecTransformer] = { + child.collectFirst { case w: SortExecTransformer => w } + } + test("don't add sort when the required ordering is empty") { + val df = sql("select * from t0") + val plan = df.queryExecution.sparkPlan + val writes = DeltaV1Writes(spark, plan, format, Nil, None, Map.empty) + assert(writes.sortPlan === plan) + assert(writes.writePlan != null) + assert(writes.executedPlan.isInstanceOf[GlutenPlan]) + val writeFilesOpt = V1WritesUtils.getWriteFilesOpt(writes.executedPlan) + assert(writeFilesOpt.isDefined) + val sortExec = getSort(writes.executedPlan) + assert(sortExec.isEmpty) + } + + test("don't add sort when the required ordering is already satisfied") { + val df = sql("select * from t0") + def check(plan: SparkPlan): Unit = { + val partitionColumns = plan.output.find(_.name == "k").toSeq + val writes = DeltaV1Writes(spark, plan, format, partitionColumns, None, Map.empty) + assert(writes.sortPlan === plan) + assert(writes.writePlan != null) + assert(writes.executedPlan.isInstanceOf[GlutenPlan]) + val writeFilesOpt = V1WritesUtils.getWriteFilesOpt(writes.executedPlan) + assert(writeFilesOpt.isDefined) + val sortExec = getSort(writes.executedPlan) + assert(sortExec.isDefined) + } + 
check(df.orderBy("k").queryExecution.sparkPlan) + check(df.orderBy("k", "j").queryExecution.sparkPlan) + } + + test("add sort when the required ordering is not satisfied") { + val df = sql("select * from t0") + def check(plan: SparkPlan): Unit = { + val partitionColumns = plan.output.find(_.name == "k").toSeq + val writes = DeltaV1Writes(spark, plan, format, partitionColumns, None, Map.empty) + val sort = writes.sortPlan.asInstanceOf[SortExec] + assert(sort.child === plan) + assert(writes.writePlan != null) + assert(writes.executedPlan.isInstanceOf[GlutenPlan]) + val writeFilesOpt = V1WritesUtils.getWriteFilesOpt(writes.executedPlan) + assert(writeFilesOpt.isDefined) + val sortExec = getSort(writes.executedPlan) + assert(sortExec.isDefined, s"writes.executedPlan: ${writes.executedPlan}") + } + check(df.queryExecution.sparkPlan) + check(df.orderBy("j", "k").queryExecution.sparkPlan) + } + +} diff --git a/backends-clickhouse/src/main/resources/org/apache/spark/sql/execution/datasources/v1/write_optimization.proto b/backends-clickhouse/src/main/resources/org/apache/spark/sql/execution/datasources/v1/write_optimization.proto index 89f606e4ffd3..fdf34f1a0a75 100644 --- a/backends-clickhouse/src/main/resources/org/apache/spark/sql/execution/datasources/v1/write_optimization.proto +++ b/backends-clickhouse/src/main/resources/org/apache/spark/sql/execution/datasources/v1/write_optimization.proto @@ -12,6 +12,9 @@ message Write { message Common { string format = 1; string job_task_attempt_id = 2; // currently used in mergetree format + + // Describes the partition index in the WriteRel.table_schema. + repeated int32 partition_col_index = 3; } message ParquetWrite{} message OrcWrite{} diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala index 40e53536184c..32961c21a266 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHRuleApi.scala @@ -93,7 +93,6 @@ object CHRuleApi { // Legacy: Post-transform rules. 
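Stepping back to the write_optimization.proto hunk above: the new repeated partition_col_index field carries, for each partition column, its ordinal in the write's child output, so the native side no longer needs to rediscover partition columns by scanning the NamedStruct column types. A simplified sketch of the driver-side computation (the genWriteParameters change further down in this diff does the equivalent with the real WriteFilesExecTransformer; this is not the literal patch code):

    import org.apache.spark.sql.catalyst.expressions.Attribute

    // Locate each partition column in the child output by expression id; the resulting
    // ordinals are what populate Write.Common.partition_col_index.
    def partitionColIndexes(
        childOutput: Seq[Attribute],
        partitionColumns: Seq[Attribute]): Seq[Int] = {
      val indexes = partitionColumns.map(p => childOutput.indexWhere(_.exprId == p.exprId))
      require(indexes.forall(_ >= 0), "every partition column must come from the child output")
      indexes
    }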
injector.injectPostTransform(_ => PruneNestedColumnsInHiveTableScan) - injector.injectPostTransform(_ => RemoveNativeWriteFilesSortAndProject()) injector.injectPostTransform(c => intercept(RewriteTransformer.apply(c.session))) injector.injectPostTransform(_ => PushDownFilterToScan) injector.injectPostTransform(_ => PushDownInputFileExpression.PostOffload) diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala index 0be8cf2c25bf..ef5a4eff6fca 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/CHTransformerApi.scala @@ -17,7 +17,7 @@ package org.apache.gluten.backendsapi.clickhouse import org.apache.gluten.backendsapi.TransformerApi -import org.apache.gluten.execution.CHHashAggregateExecTransformer +import org.apache.gluten.execution.{CHHashAggregateExecTransformer, WriteFilesExecTransformer} import org.apache.gluten.expression.ConverterUtils import org.apache.gluten.substrait.expression.{BooleanLiteralNode, ExpressionBuilder, ExpressionNode} import org.apache.gluten.utils.{CHInputPartitionsUtil, ExpressionDocUtil} @@ -31,7 +31,7 @@ import org.apache.spark.sql.delta.catalog.ClickHouseTableV2 import org.apache.spark.sql.delta.files.TahoeFileIndex import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.aggregate.HashAggregateExec -import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} import org.apache.spark.sql.execution.datasources.orc.OrcFileFormat import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat import org.apache.spark.sql.execution.datasources.v1.Write @@ -243,24 +243,31 @@ class CHTransformerApi extends TransformerApi with Logging { GlutenDriverEndpoint.invalidateResourceRelation(executionId) } - override def genWriteParameters( - fileFormat: FileFormat, - writeOptions: Map[String, String]): Any = { - val fileFormatStr = fileFormat match { + override def genWriteParameters(writeExec: WriteFilesExecTransformer): Any = { + val fileFormatStr = writeExec.fileFormat match { case register: DataSourceRegister => register.shortName case _ => "UnknownFileFormat" } - val write = Write + val childOutput = writeExec.child.output + + val partitionIndexes = + writeExec.partitionColumns.map(p => childOutput.indexWhere(_.exprId == p.exprId)) + require(partitionIndexes.forall(_ >= 0)) + + val common = Write.Common .newBuilder() - .setCommon( - Write.Common - .newBuilder() - .setFormat(fileFormatStr) - .setJobTaskAttemptId("") // we can get job and task id at the driver side - .build()) + .setFormat(s"$fileFormatStr") + .setJobTaskAttemptId("") // we cannot get job and task id at the driver side) + partitionIndexes.foreach { + idx => + require(idx >= 0) + common.addPartitionColIndex(idx) + } + + val write = Write.newBuilder().setCommon(common.build()) - fileFormat match { + writeExec.fileFormat match { case d: MergeTreeFileFormat => write.setMergetree(MergeTreeFileFormat.createWrite(d.metadata)) case _: ParquetFileFormat => @@ -273,5 +280,5 @@ class CHTransformerApi extends TransformerApi with Logging { /** use Hadoop Path class to encode the file path */ override def encodeFilePathIfNeed(filePath: String): String = - 
(new Path(filePath)).toUri.toASCIIString + new Path(filePath).toUri.toASCIIString } diff --git a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/RuntimeConfig.scala b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/RuntimeConfig.scala index 12bb8d05d953..055c3b9d87b8 100644 --- a/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/RuntimeConfig.scala +++ b/backends-clickhouse/src/main/scala/org/apache/gluten/backendsapi/clickhouse/RuntimeConfig.scala @@ -22,6 +22,7 @@ object RuntimeConfig { import CHConf._ import SQLConf._ + /** Clickhouse Configuration */ val PATH = buildConf(runtimeConfig("path")) .doc( @@ -37,9 +38,25 @@ object RuntimeConfig { .createWithDefault("/tmp/libch") // scalastyle:on line.size.limit + // scalastyle:off line.size.limit + val LOGGER_LEVEL = + buildConf(runtimeConfig("logger.level")) + .doc( + "https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings#logger") + .stringConf + .createWithDefault("warning") + // scalastyle:on line.size.limit + + /** Gluten Configuration */ val USE_CURRENT_DIRECTORY_AS_TMP = buildConf(runtimeConfig("use_current_directory_as_tmp")) .doc("Use the current directory as the temporary directory.") .booleanConf .createWithDefault(false) + + val DUMP_PIPELINE = + buildConf(runtimeConfig("dump_pipeline")) + .doc("Dump pipeline to file after execution") + .booleanConf + .createWithDefault(false) } diff --git a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala index 1342e250430e..427db0aad2b5 100644 --- a/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala +++ b/backends-clickhouse/src/main/scala/org/apache/spark/sql/execution/CHColumnarWrite.scala @@ -198,12 +198,12 @@ case class NativeStatCompute(rows: Seq[InternalRow]) { } case class NativeBasicWriteTaskStatsTracker( - description: WriteJobDescription, + writeDir: String, basicWriteJobStatsTracker: WriteTaskStatsTracker) extends (NativeFileWriteResult => Unit) { private var numWrittenRows: Long = 0 override def apply(stat: NativeFileWriteResult): Unit = { - val absolutePath = s"${description.path}/${stat.relativePath}" + val absolutePath = s"$writeDir/${stat.relativePath}" if (stat.partition_id != "__NO_PARTITION_ID__") { basicWriteJobStatsTracker.newPartition(new GenericInternalRow(Array[Any](stat.partition_id))) } @@ -248,6 +248,8 @@ case class HadoopMapReduceCommitProtocolWrite( extends CHColumnarWrite[HadoopMapReduceCommitProtocol] with Logging { + private var stageDir: String = _ + private lazy val adapter: HadoopMapReduceAdapter = HadoopMapReduceAdapter(committer) /** @@ -257,11 +259,12 @@ case class HadoopMapReduceCommitProtocolWrite( override def doSetupNativeTask(): Unit = { val (writePath, writeFilePattern) = adapter.getTaskAttemptTempPathAndFilePattern(taskAttemptContext, description) - logDebug(s"Native staging write path: $writePath and file pattern: $writeFilePattern") + stageDir = writePath + logDebug(s"Native staging write path: $stageDir and file pattern: $writeFilePattern") val settings = Map( - RuntimeSettings.TASK_WRITE_TMP_DIR.key -> writePath, + RuntimeSettings.TASK_WRITE_TMP_DIR.key -> stageDir, RuntimeSettings.TASK_WRITE_FILENAME_PATTERN.key -> writeFilePattern) NativeExpressionEvaluator.updateQueryRuntimeSettings(settings) } @@ -272,7 +275,7 @@ case class HadoopMapReduceCommitProtocolWrite( None } else 
{ val commitInfo = FileCommitInfo(description) - val basicNativeStat = NativeBasicWriteTaskStatsTracker(description, basicWriteJobStatsTracker) + val basicNativeStat = NativeBasicWriteTaskStatsTracker(stageDir, basicWriteJobStatsTracker) val basicNativeStats = Seq(commitInfo, basicNativeStat) NativeStatCompute(stats)(basicNativeStats) val (partitions, addedAbsPathFiles) = commitInfo.result diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala index 2f55510a7b1f..3736f0f14415 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseDeltaParquetWriteSuite.scala @@ -1025,6 +1025,7 @@ class GlutenClickHouseDeltaParquetWriteSuite } } + // FIXME: optimize testSparkVersionLE33("test parquet optimize with the path based table") { val dataPath = s"$basePath/lineitem_delta_parquet_optimize_path_based" clearDataPath(dataPath) diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/mergetree/GlutenClickHouseMergeTreeWriteSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/mergetree/GlutenClickHouseMergeTreeWriteSuite.scala index cc577609656b..60ca58d9fc29 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/mergetree/GlutenClickHouseMergeTreeWriteSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/mergetree/GlutenClickHouseMergeTreeWriteSuite.scala @@ -57,6 +57,7 @@ class GlutenClickHouseMergeTreeWriteSuite .set("spark.sql.adaptive.enabled", "true") .set("spark.sql.files.maxPartitionBytes", "20000000") .set(GlutenConfig.NATIVE_WRITER_ENABLED.key, "true") + .set(CHConf.ENABLE_ONEPIPELINE_MERGETREE_WRITE.key, spark35.toString) .setCHSettings("min_insert_block_size_rows", 100000) .setCHSettings("mergetree.merge_after_insert", false) .setCHSettings("input_format_parquet_max_block_size", 8192) @@ -67,178 +68,172 @@ class GlutenClickHouseMergeTreeWriteSuite } test("test mergetree table write") { - withSQLConf((CHConf.ENABLE_ONEPIPELINE_MERGETREE_WRITE.key, spark35.toString)) { - spark.sql(s""" - |DROP TABLE IF EXISTS lineitem_mergetree; - |""".stripMargin) + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_mergetree; + |""".stripMargin) - // write.format.default = mergetree - spark.sql(s""" - |CREATE TABLE IF NOT EXISTS lineitem_mergetree - |( - | l_orderkey bigint, - | l_partkey bigint, - | l_suppkey bigint, - | l_linenumber bigint, - | l_quantity double, - | l_extendedprice double, - | l_discount double, - | l_tax double, - | l_returnflag string, - | l_linestatus string, - | l_shipdate date, - | l_commitdate date, - | l_receiptdate date, - | l_shipinstruct string, - | l_shipmode string, - | l_comment string - |) - |USING clickhouse - |TBLPROPERTIES (write.format.default = 'mergetree') - |LOCATION '$basePath/lineitem_mergetree' - |""".stripMargin) + // write.format.default = mergetree + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_mergetree + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode 
string, + | l_comment string + |) + |USING clickhouse + |TBLPROPERTIES (write.format.default = 'mergetree') + |LOCATION '$basePath/lineitem_mergetree' + |""".stripMargin) - spark.sql(s""" - | insert into table lineitem_mergetree - | select * from lineitem - |""".stripMargin) + spark.sql(s""" + | insert into table lineitem_mergetree + | select * from lineitem + |""".stripMargin) - runTPCHQueryBySQL(1, q1("lineitem_mergetree")) { - df => - val plans = collect(df.queryExecution.executedPlan) { - case f: FileSourceScanExecTransformer => f - case w: WholeStageTransformer => w - } - assertResult(4)(plans.size) + runTPCHQueryBySQL(1, q1("lineitem_mergetree")) { + df => + val plans = collect(df.queryExecution.executedPlan) { + case f: FileSourceScanExecTransformer => f + case w: WholeStageTransformer => w + } + assertResult(4)(plans.size) - val mergetreeScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] - assert(mergetreeScan.nodeName.startsWith("ScanTransformer mergetree")) + val mergetreeScan = plans(3).asInstanceOf[FileSourceScanExecTransformer] + assert(mergetreeScan.nodeName.startsWith("ScanTransformer mergetree")) - val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).orderByKeyOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) - assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) - val addFiles = - fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) - assertResult(6)(addFiles.size) - assertResult(600572)(addFiles.map(_.rows).sum) + val fileIndex = mergetreeScan.relation.location.asInstanceOf[TahoeFileIndex] + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).clickhouseTableConfigs.nonEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).bucketOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).orderByKeyOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).primaryKeyOption.isEmpty) + assert(ClickHouseTableV2.getTable(fileIndex.deltaLog).partitionColumns.isEmpty) + val addFiles = + fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) + assertResult(6)(addFiles.size) + assertResult(600572)(addFiles.map(_.rows).sum) - // GLUTEN-5060: check the unnecessary FilterExec - val wholeStageTransformer = plans(2).asInstanceOf[WholeStageTransformer] - val planNodeJson = wholeStageTransformer.substraitPlanJson - assert( - !planNodeJson - .replaceAll("\n", "") - .replaceAll(" ", "") - .contains("\"input\":{\"filter\":{")) - } + // GLUTEN-5060: check the unnecessary FilterExec + val wholeStageTransformer = plans(2).asInstanceOf[WholeStageTransformer] + val planNodeJson = wholeStageTransformer.substraitPlanJson + assert( + !planNodeJson + .replaceAll("\n", "") + .replaceAll(" ", "") + .contains("\"input\":{\"filter\":{")) } } test("test mergetree insert overwrite") { - withSQLConf((CHConf.ENABLE_ONEPIPELINE_MERGETREE_WRITE.key, spark35.toString)) { - spark.sql(s""" - |DROP TABLE IF EXISTS lineitem_mergetree_insertoverwrite; - |""".stripMargin) + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_mergetree_insertoverwrite; + |""".stripMargin) - spark.sql(s""" - |CREATE TABLE IF NOT EXISTS lineitem_mergetree_insertoverwrite - |( - | l_orderkey bigint, - | l_partkey bigint, - | l_suppkey 
bigint, - | l_linenumber bigint, - | l_quantity double, - | l_extendedprice double, - | l_discount double, - | l_tax double, - | l_returnflag string, - | l_linestatus string, - | l_shipdate date, - | l_commitdate date, - | l_receiptdate date, - | l_shipinstruct string, - | l_shipmode string, - | l_comment string - |) - |USING clickhouse - |LOCATION '$basePath/lineitem_mergetree_insertoverwrite' - |""".stripMargin) + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_mergetree_insertoverwrite + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING clickhouse + |LOCATION '$basePath/lineitem_mergetree_insertoverwrite' + |""".stripMargin) - spark.sql(s""" - | insert into table lineitem_mergetree_insertoverwrite - | select * from lineitem - |""".stripMargin) + spark.sql(s""" + | insert into table lineitem_mergetree_insertoverwrite + | select * from lineitem + |""".stripMargin) - spark.sql(s""" - | insert overwrite table lineitem_mergetree_insertoverwrite - | select * from lineitem where mod(l_orderkey,2) = 1 - |""".stripMargin) - val sql2 = - s""" - | select count(*) from lineitem_mergetree_insertoverwrite - | - |""".stripMargin - assertResult(300001)( - // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) - ) - } + spark.sql(s""" + | insert overwrite table lineitem_mergetree_insertoverwrite + | select * from lineitem where mod(l_orderkey,2) = 1 + |""".stripMargin) + val sql2 = + s""" + | select count(*) from lineitem_mergetree_insertoverwrite + | + |""".stripMargin + assertResult(300001)( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) + ) } test("test mergetree insert overwrite partitioned table with small table, static") { - withSQLConf((CHConf.ENABLE_ONEPIPELINE_MERGETREE_WRITE.key, spark35.toString)) { - spark.sql(s""" - |DROP TABLE IF EXISTS lineitem_mergetree_insertoverwrite2; - |""".stripMargin) + spark.sql(s""" + |DROP TABLE IF EXISTS lineitem_mergetree_insertoverwrite2; + |""".stripMargin) - spark.sql(s""" - |CREATE TABLE IF NOT EXISTS lineitem_mergetree_insertoverwrite2 - |( - | l_orderkey bigint, - | l_partkey bigint, - | l_suppkey bigint, - | l_linenumber bigint, - | l_quantity double, - | l_extendedprice double, - | l_discount double, - | l_tax double, - | l_returnflag string, - | l_linestatus string, - | l_shipdate date, - | l_commitdate date, - | l_receiptdate date, - | l_shipinstruct string, - | l_shipmode string, - | l_comment string - |) - |USING clickhouse - |PARTITIONED BY (l_shipdate) - |LOCATION '$basePath/lineitem_mergetree_insertoverwrite2' - |""".stripMargin) + spark.sql(s""" + |CREATE TABLE IF NOT EXISTS lineitem_mergetree_insertoverwrite2 + |( + | l_orderkey bigint, + | l_partkey bigint, + | l_suppkey bigint, + | l_linenumber bigint, + | l_quantity double, + | l_extendedprice double, + | l_discount double, + | l_tax double, + | l_returnflag string, + | l_linestatus string, + | l_shipdate date, + | l_commitdate date, + | l_receiptdate date, + | l_shipinstruct string, + | l_shipmode string, + | l_comment string + |) + |USING clickhouse + |PARTITIONED BY (l_shipdate) + |LOCATION '$basePath/lineitem_mergetree_insertoverwrite2' + |""".stripMargin) - spark.sql(s""" - | insert 
into table lineitem_mergetree_insertoverwrite2 - | select * from lineitem - |""".stripMargin) + spark.sql(s""" + | insert into table lineitem_mergetree_insertoverwrite2 + | select * from lineitem + |""".stripMargin) - spark.sql( - s""" - | insert overwrite table lineitem_mergetree_insertoverwrite2 - | select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' - |""".stripMargin) - val sql2 = - s""" - | select count(*) from lineitem_mergetree_insertoverwrite2 - | - |""".stripMargin - assertResult(2418)( - // total rows should remain unchanged - spark.sql(sql2).collect().apply(0).get(0) - ) - } + spark.sql( + s""" + | insert overwrite table lineitem_mergetree_insertoverwrite2 + | select * from lineitem where l_shipdate BETWEEN date'1993-02-01' AND date'1993-02-10' + |""".stripMargin) + val sql2 = + s""" + | select count(*) from lineitem_mergetree_insertoverwrite2 + | + |""".stripMargin + assertResult(2418)( + // total rows should remain unchanged + spark.sql(sql2).collect().apply(0).get(0) + ) } test("test mergetree insert overwrite partitioned table with small table, dynamic") { @@ -650,8 +645,8 @@ class GlutenClickHouseMergeTreeWriteSuite // static partition spark.sql(s""" - | insert into lineitem_mergetree_partition PARTITION (l_shipdate=date'1995-01-21', - | l_returnflag = 'A') + | insert into lineitem_mergetree_partition + | PARTITION (l_shipdate=date'1995-01-21', l_returnflag = 'A') | (l_orderkey, | l_partkey, | l_suppkey, @@ -729,7 +724,8 @@ class GlutenClickHouseMergeTreeWriteSuite ClickHouseTableV2 .getTable(fileIndex.deltaLog) .partitionColumns(1)) - val addFiles = fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) + val addFiles = + fileIndex.matchingFiles(Nil, Nil).map(f => f.asInstanceOf[AddMergeTreeParts]) assertResult(3836)(addFiles.size) assertResult(605363)(addFiles.map(_.rows).sum) @@ -739,7 +735,7 @@ class GlutenClickHouseMergeTreeWriteSuite } } - test("test mergetree write with bucket table") { + testSparkVersionLE33("test mergetree write with bucket table") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_bucket; |""".stripMargin) @@ -979,7 +975,7 @@ class GlutenClickHouseMergeTreeWriteSuite } } - test("test mergetree CTAS complex") { + test("test mergetree CTAS partition") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_ctas2; |""".stripMargin) @@ -988,8 +984,6 @@ class GlutenClickHouseMergeTreeWriteSuite |CREATE TABLE IF NOT EXISTS lineitem_mergetree_ctas2 |USING clickhouse |PARTITIONED BY (l_shipdate) - |CLUSTERED BY (l_orderkey) - |${if (spark32) "" else "SORTED BY (l_partkey, l_returnflag)"} INTO 4 BUCKETS |LOCATION '$basePath/lineitem_mergetree_ctas2' | as select * from lineitem |""".stripMargin) @@ -1598,7 +1592,7 @@ class GlutenClickHouseMergeTreeWriteSuite case scanExec: BasicScanExecTransformer => scanExec } assertResult(1)(plans.size) - assertResult(conf._2)(plans.head.getSplitInfos.size) + assertResult(conf._2)(plans.head.getSplitInfos().size) } } }) @@ -1622,12 +1616,12 @@ class GlutenClickHouseMergeTreeWriteSuite case scanExec: BasicScanExecTransformer => scanExec } assertResult(1)(plans.size) - assertResult(1)(plans.head.getSplitInfos.size) + assertResult(1)(plans.head.getSplitInfos().size) } } } - test("test mergetree with primary keys filter pruning by driver with bucket") { + testSparkVersionLE33("test mergetree with primary keys filter pruning by driver with bucket") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_pk_pruning_by_driver_bucket; |""".stripMargin) @@ -1730,7 
+1724,7 @@ class GlutenClickHouseMergeTreeWriteSuite case f: BasicScanExecTransformer => f } assertResult(2)(scanExec.size) - assertResult(conf._2)(scanExec(1).getSplitInfos.size) + assertResult(conf._2)(scanExec(1).getSplitInfos().size) } } }) @@ -1776,7 +1770,7 @@ class GlutenClickHouseMergeTreeWriteSuite Seq("true", "false").foreach { skip => - withSQLConf("spark.databricks.delta.stats.skipping" -> skip.toString) { + withSQLConf("spark.databricks.delta.stats.skipping" -> skip) { val sqlStr = s""" |SELECT @@ -1799,7 +1793,7 @@ class GlutenClickHouseMergeTreeWriteSuite } } - test("test mergetree with column case sensitive") { + testSparkVersionLE33("test mergetree with column case sensitive") { spark.sql(s""" |DROP TABLE IF EXISTS LINEITEM_MERGETREE_CASE_SENSITIVE; |""".stripMargin) @@ -1838,7 +1832,7 @@ class GlutenClickHouseMergeTreeWriteSuite runTPCHQueryBySQL(6, q6("lineitem_mergetree_case_sensitive")) { _ => } } - test("test mergetree with partition with whitespace") { + testSparkVersionLE33("test mergetree with partition with whitespace") { spark.sql(s""" |DROP TABLE IF EXISTS lineitem_mergetree_partition_with_whitespace; |""".stripMargin) @@ -1900,7 +1894,7 @@ class GlutenClickHouseMergeTreeWriteSuite Seq(("-1", 3), ("3", 3), ("6", 1)).foreach( conf => { withSQLConf( - ("spark.gluten.sql.columnar.backend.ch.files.per.partition.threshold" -> conf._1)) { + "spark.gluten.sql.columnar.backend.ch.files.per.partition.threshold" -> conf._1) { val sql = s""" |select count(1), min(l_returnflag) from lineitem_split @@ -1913,7 +1907,7 @@ class GlutenClickHouseMergeTreeWriteSuite val scanExec = collect(df.queryExecution.executedPlan) { case f: FileSourceScanExecTransformer => f } - assert(scanExec(0).getPartitions.size == conf._2) + assert(scanExec.head.getPartitions.size == conf._2) } } }) diff --git a/backends-velox/pom.xml b/backends-velox/pom.xml index 48a044a17f2f..240b2218641c 100755 --- a/backends-velox/pom.xml +++ b/backends-velox/pom.xml @@ -28,6 +28,54 @@ org.apache.gluten.tags.UDFTest + + celeborn + + false + + + + org.apache.gluten + gluten-celeborn + ${project.version} + + + org.apache.celeborn + celeborn-client-spark-${spark.major.version}-shaded_${scala.binary.version} + ${celeborn.version} + provided + + + org.apache.celeborn + celeborn-client-spark-${spark.major.version}_${scala.binary.version} + + + org.apache.celeborn + celeborn-spark-${spark.major.version}-columnar-shuffle_${scala.binary.version} + + + + + + + uniffle + + false + + + + org.apache.gluten + gluten-uniffle + ${project.version} + + + org.apache.uniffle + rss-client-spark${spark.major.version}-shaded + ${uniffle.version} + provided + + + iceberg @@ -82,6 +130,29 @@ + + hudi + + + org.apache.gluten + gluten-hudi + ${project.version} + + + org.apache.gluten + gluten-hudi + ${project.version} + test-jar + test + + + org.apache.hudi + hudi-spark${sparkbundle.version}-bundle_${scala.binary.version} + ${hudi.version} + provided + + + diff --git a/gluten-celeborn/velox/src/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory b/backends-velox/src-celeborn/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory similarity index 100% rename from gluten-celeborn/velox/src/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory rename to backends-velox/src-celeborn/main/resources/META-INF/services/org.apache.spark.shuffle.gluten.celeborn.CelebornShuffleWriterFactory diff --git 
a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala b/backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala similarity index 100% rename from gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala rename to backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarBatchSerializer.scala diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala b/backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala similarity index 100% rename from gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala rename to backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriter.scala diff --git a/gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriterFactory.scala b/backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriterFactory.scala similarity index 100% rename from gluten-celeborn/velox/src/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriterFactory.scala rename to backends-velox/src-celeborn/main/scala/org/apache/spark/shuffle/VeloxCelebornColumnarShuffleWriterFactory.scala diff --git a/backends-velox/src-hudi/test/scala/org/apache/execution/VeloxHudiSuite.scala b/backends-velox/src-hudi/test/scala/org/apache/execution/VeloxHudiSuite.scala new file mode 100644 index 000000000000..00498f87411a --- /dev/null +++ b/backends-velox/src-hudi/test/scala/org/apache/execution/VeloxHudiSuite.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.execution + +import org.apache.gluten.execution.HudiSuite + +class VeloxHudiSuite extends HudiSuite {} diff --git a/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxTPCHHudiSuite.scala b/backends-velox/src-hudi/test/scala/org/apache/execution/VeloxTPCHHudiSuite.scala similarity index 91% rename from gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxTPCHHudiSuite.scala rename to backends-velox/src-hudi/test/scala/org/apache/execution/VeloxTPCHHudiSuite.scala index a4e10269c286..cdb3b2918080 100644 --- a/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxTPCHHudiSuite.scala +++ b/backends-velox/src-hudi/test/scala/org/apache/execution/VeloxTPCHHudiSuite.scala @@ -14,16 +14,16 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -package org.apache.gluten.execution +package org.apache.execution +import org.apache.gluten.execution.VeloxTPCHSuite import org.apache.spark.SparkConf import java.io.File class VeloxTPCHHudiSuite extends VeloxTPCHSuite { - - protected val tpchBasePath: String = new File( - "../backends-velox/src/test/resources").getAbsolutePath + protected val tpchBasePath: String = + getClass.getResource("/").getPath + "../../../src/test/resources" override protected val resourcePath: String = new File(tpchBasePath, "tpch-data-parquet").getCanonicalPath diff --git a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/gluten/uniffle/UniffleShuffleManager.java b/backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/gluten/uniffle/UniffleShuffleManager.java similarity index 100% rename from gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/gluten/uniffle/UniffleShuffleManager.java rename to backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/gluten/uniffle/UniffleShuffleManager.java diff --git a/gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java b/backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java similarity index 100% rename from gluten-uniffle/velox/src/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java rename to backends-velox/src-uniffle/main/java/org/apache/spark/shuffle/writer/VeloxUniffleColumnarShuffleWriter.java diff --git a/gluten-uniffle/velox/src/main/scala/org/apache/spark/shuffle/writer/PartitionPusher.scala b/backends-velox/src-uniffle/main/scala/org/apache/spark/shuffle/writer/PartitionPusher.scala similarity index 100% rename from gluten-uniffle/velox/src/main/scala/org/apache/spark/shuffle/writer/PartitionPusher.scala rename to backends-velox/src-uniffle/main/scala/org/apache/spark/shuffle/writer/PartitionPusher.scala diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala index c6d2bc065879..d156fffa8b21 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxTransformerApi.scala @@ -27,7 +27,7 @@ import org.apache.gluten.vectorized.PlanEvaluatorJniWrapper import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.connector.read.InputPartition -import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} import org.apache.spark.sql.sources.DataSourceRegister import org.apache.spark.sql.types._ import org.apache.spark.task.TaskResources @@ -96,16 +96,14 @@ class VeloxTransformerApi extends TransformerApi with Logging { override def packPBMessage(message: Message): Any = Any.pack(message, "") - override def genWriteParameters( - fileFormat: FileFormat, - writeOptions: Map[String, String]): Any = { - val fileFormatStr = fileFormat match { + override def genWriteParameters(write: WriteFilesExecTransformer): Any = { + val fileFormatStr = write.fileFormat match { case register: DataSourceRegister => register.shortName case _ => "UnknownFileFormat" } val compressionCodec = - WriteFilesExecTransformer.getCompressionCodec(writeOptions).capitalize 
+ WriteFilesExecTransformer.getCompressionCodec(write.caseInsensitiveOptions).capitalize val writeParametersStr = new StringBuffer("WriteParameters:") writeParametersStr.append("is").append(compressionCodec).append("=1") writeParametersStr.append(";format=").append(fileFormatStr).append("\n") diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala index 8063a5d12207..989def88e70c 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/MiscOperatorSuite.scala @@ -1791,6 +1791,13 @@ class MiscOperatorSuite extends VeloxWholeStageTransformerSuite with AdaptiveSpa assert(plan2.find(_.isInstanceOf[ProjectExecTransformer]).isDefined) } + test("cast timestamp to date") { + val query = "select cast(ts as date) from values (timestamp'2024-01-01 00:00:00') as tab(ts)" + runQueryAndCompare(query) { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + test("timestamp broadcast join") { spark.range(0, 5).createOrReplaceTempView("right") spark.sql("SELECT id, timestamp_micros(id) as ts from right").createOrReplaceTempView("left") diff --git a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala index 46d6870b04c9..94ea8be5200d 100644 --- a/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala +++ b/backends-velox/src/test/scala/org/apache/gluten/execution/ScalarFunctionsValidateSuite.scala @@ -1357,6 +1357,26 @@ abstract class ScalarFunctionsValidateSuite extends FunctionsValidateSuite { } } + test("concat_ws") { + runQueryAndCompare("SELECT concat_ws('~~', c_comment, c_address) FROM customer LIMIT 50") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + + withTempPath { + path => + Seq[Seq[String]](Seq("ab", null, "cd", "", "ef"), Seq(null, "x", "", "y"), Seq.empty, null) + .toDF("col") + .write + .parquet(path.getCanonicalPath) + + spark.read.parquet(path.getCanonicalPath).createOrReplaceTempView("array_tbl") + + runQueryAndCompare("SELECT concat_ws('~~', col, 'end') AS res from array_tbl;") { + checkGlutenOperatorMatch[ProjectExecTransformer] + } + } + } + test("Test input_file_name function") { runQueryAndCompare("""SELECT input_file_name(), l_orderkey | from lineitem limit 100""".stripMargin) { diff --git a/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.cpp b/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.cpp index a76b4d398d97..0d57d53ff640 100644 --- a/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.cpp +++ b/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.cpp @@ -21,9 +21,8 @@ #include #include #include -#include #include -#include +#include #include #include #include @@ -103,7 +102,7 @@ void adjust_output(const DB::QueryPipelineBuilderPtr & builder, const DB::Block { throw DB::Exception( DB::ErrorCodes::LOGICAL_ERROR, - "Missmatch result columns size, input size is {}, but output size is {}", + "Mismatch result columns size, input size is {}, but output size is {}", input.columns(), output.columns()); } @@ -164,12 +163,6 @@ void addMergeTreeSinkTransform( : std::make_shared(header, partition_by, merge_tree_table, write_settings, context, stats); chain.addSource(sink); - const DB::Settings & settings = context->getSettingsRef(); - chain.addSource(std::make_shared( - 
header, settings[Setting::min_insert_block_size_rows], settings[Setting::min_insert_block_size_bytes])); - chain.addSource(std::make_shared( - header, settings[Setting::min_insert_block_size_rows], settings[Setting::min_insert_block_size_bytes])); - builder->addChain(std::move(chain)); } @@ -212,6 +205,7 @@ void addNormalFileWriterSinkTransform( namespace local_engine { + IMPLEMENT_GLUTEN_SETTINGS(GlutenWriteSettings, WRITE_RELATED_SETTINGS) void addSinkTransform(const DB::ContextPtr & context, const substrait::WriteRel & write_rel, const DB::QueryPipelineBuilderPtr & builder) @@ -224,12 +218,18 @@ void addSinkTransform(const DB::ContextPtr & context, const substrait::WriteRel throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Failed to unpack write optimization with local_engine::Write."); assert(write.has_common()); const substrait::NamedStruct & table_schema = write_rel.table_schema(); - auto output = TypeParser::buildBlockFromNamedStruct(table_schema); - adjust_output(builder, output); - const auto partitionCols = collect_partition_cols(output, table_schema); + auto partition_indexes = write.common().partition_col_index(); if (write.has_mergetree()) { - local_engine::MergeTreeTable merge_tree_table(write, table_schema); + MergeTreeTable merge_tree_table(write, table_schema); + auto output = TypeParser::buildBlockFromNamedStruct(table_schema, merge_tree_table.low_card_key); + adjust_output(builder, output); + + builder->addSimpleTransform( + [&](const Block & in_header) -> ProcessorPtr { return std::make_shared(in_header, false); }); + + const auto partition_by = collect_partition_cols(output, table_schema, partition_indexes); + GlutenWriteSettings write_settings = GlutenWriteSettings::get(context); if (write_settings.task_write_tmp_dir.empty()) throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "MergeTree Write Pipeline need inject relative path."); @@ -237,23 +237,35 @@ void addSinkTransform(const DB::ContextPtr & context, const substrait::WriteRel throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Non empty relative path for MergeTree table in pipeline mode."); merge_tree_table.relative_path = write_settings.task_write_tmp_dir; - addMergeTreeSinkTransform(context, builder, merge_tree_table, output, partitionCols); + addMergeTreeSinkTransform(context, builder, merge_tree_table, output, partition_by); } else - addNormalFileWriterSinkTransform(context, builder, write.common().format(), output, partitionCols); + { + auto output = TypeParser::buildBlockFromNamedStruct(table_schema); + adjust_output(builder, output); + const auto partition_by = collect_partition_cols(output, table_schema, partition_indexes); + addNormalFileWriterSinkTransform(context, builder, write.common().format(), output, partition_by); + } } - -DB::Names collect_partition_cols(const DB::Block & header, const substrait::NamedStruct & struct_) +DB::Names collect_partition_cols(const DB::Block & header, const substrait::NamedStruct & struct_, const PartitionIndexes & partition_by) { - DB::Names result; + if (partition_by.empty()) + { + assert(std::ranges::all_of( + struct_.column_types(), [](const int32_t type) { return type != ::substrait::NamedStruct::PARTITION_COL; })); + return {}; + } assert(struct_.column_types_size() == header.columns()); assert(struct_.column_types_size() == struct_.struct_().types_size()); - auto name_iter = header.begin(); - auto type_iter = struct_.column_types().begin(); - for (; name_iter != header.end(); ++name_iter, ++type_iter) - if (*type_iter == 
::substrait::NamedStruct::PARTITION_COL) - result.push_back(name_iter->name); + DB::Names result; + result.reserve(partition_by.size()); + for (auto idx : partition_by) + { + assert(idx >= 0 && idx < header.columns()); + assert(struct_.column_types(idx) == ::substrait::NamedStruct::PARTITION_COL); + result.emplace_back(header.getByPosition(idx).name); + } return result; } diff --git a/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.h b/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.h index 01e0dabaaa7d..bb8c15c07d87 100644 --- a/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.h +++ b/cpp-ch/local-engine/Parser/RelParsers/WriteRelParser.h @@ -21,6 +21,7 @@ #include #include #include +#include #include namespace substrait @@ -38,9 +39,11 @@ using QueryPipelineBuilderPtr = std::unique_ptr; namespace local_engine { +using PartitionIndexes = google::protobuf::RepeatedField<::int32_t>; + void addSinkTransform(const DB::ContextPtr & context, const substrait::WriteRel & write_rel, const DB::QueryPipelineBuilderPtr & builder); -DB::Names collect_partition_cols(const DB::Block & header, const substrait::NamedStruct & struct_); +DB::Names collect_partition_cols(const DB::Block & header, const substrait::NamedStruct & struct_, const PartitionIndexes & partition_by); #define WRITE_RELATED_SETTINGS(M, ALIAS) \ M(String, task_write_tmp_dir, , "The temporary directory for writing data") \ diff --git a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.cpp b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.cpp index 6c9dd890d851..d41e71fb848d 100644 --- a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.cpp +++ b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.cpp @@ -31,27 +31,37 @@ extern const Metric GlobalThreadActive; extern const Metric GlobalThreadScheduled; } +namespace DB::Setting +{ +extern const SettingsUInt64 min_insert_block_size_rows; +extern const SettingsUInt64 min_insert_block_size_bytes; +} namespace local_engine { -void SparkMergeTreeSink::consume(Chunk & chunk) +void SparkMergeTreeSink::write(const Chunk & chunk) { - assert(!sink_helper->metadata_snapshot->hasPartitionKey()); + CurrentThread::flushUntrackedMemory(); + /// Reset earlier, so put it in the scope BlockWithPartition item{getHeader().cloneWithColumns(chunk.getColumns()), Row{}}; - size_t before_write_memory = 0; - if (auto * memory_tracker = CurrentThread::getMemoryTracker()) - { - CurrentThread::flushUntrackedMemory(); - before_write_memory = memory_tracker->get(); - } + sink_helper->writeTempPart(item, context, part_num); part_num++; - /// Reset earlier to free memory - item.block.clear(); - item.partition.clear(); +} - sink_helper->checkAndMerge(); +void SparkMergeTreeSink::consume(Chunk & chunk) +{ + Chunk tmp; + tmp.swap(chunk); + squashed_chunk = squashing.add(std::move(tmp)); + if (static_cast(squashed_chunk)) + { + write(Squashing::squash(std::move(squashed_chunk))); + sink_helper->checkAndMerge(); + } + assert(squashed_chunk.getNumRows() == 0); + assert(chunk.getNumRows() == 0); } void SparkMergeTreeSink::onStart() @@ -61,6 +71,11 @@ void SparkMergeTreeSink::onStart() void SparkMergeTreeSink::onFinish() { + assert(squashed_chunk.getNumRows() == 0); + squashed_chunk = squashing.flush(); + if (static_cast(squashed_chunk)) + write(Squashing::squash(std::move(squashed_chunk))); + assert(squashed_chunk.getNumRows() == 0); sink_helper->finish(context); if (stats_.has_value()) (*stats_)->collectStats(sink_helper->unsafeGet(), 
sink_helper->write_settings.partition_settings.partition_dir); @@ -91,7 +106,9 @@ SinkToStoragePtr SparkMergeTreeSink::create( } else sink_helper = std::make_shared(dest_storage, write_settings_, isRemoteStorage); - return std::make_shared(sink_helper, context, stats); + const DB::Settings & settings = context->getSettingsRef(); + return std::make_shared( + sink_helper, context, stats, settings[Setting::min_insert_block_size_rows], settings[Setting::min_insert_block_size_bytes]); } SinkHelper::SinkHelper(const SparkStorageMergeTreePtr & data_, const SparkMergeTreeWriteSettings & write_settings_, bool isRemoteStorage_) diff --git a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.h b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.h index b551d86d1d0c..828332d2d6c9 100644 --- a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.h +++ b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeSink.h @@ -227,8 +227,17 @@ class SparkMergeTreeSink : public DB::SinkToStorage const DB::ContextMutablePtr & context, const SinkStatsOption & stats = {}); - explicit SparkMergeTreeSink(const SinkHelperPtr & sink_helper_, const ContextPtr & context_, const SinkStatsOption & stats) - : SinkToStorage(sink_helper_->metadata_snapshot->getSampleBlock()), context(context_), sink_helper(sink_helper_), stats_(stats) + explicit SparkMergeTreeSink( + const SinkHelperPtr & sink_helper_, + const ContextPtr & context_, + const SinkStatsOption & stats, + size_t min_block_size_rows, + size_t min_block_size_bytes) + : SinkToStorage(sink_helper_->metadata_snapshot->getSampleBlock()) + , context(context_) + , sink_helper(sink_helper_) + , stats_(stats) + , squashing(sink_helper_->metadata_snapshot->getSampleBlock(), min_block_size_rows, min_block_size_bytes) { } ~SparkMergeTreeSink() override = default; @@ -241,9 +250,13 @@ class SparkMergeTreeSink : public DB::SinkToStorage const SinkHelper & sinkHelper() const { return *sink_helper; } private: + void write(const Chunk & chunk); + ContextPtr context; SinkHelperPtr sink_helper; std::optional> stats_; + Squashing squashing; + Chunk squashed_chunk; int part_num = 1; }; diff --git a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeWriter.cpp b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeWriter.cpp index a8fdfff6ff75..95145d43fab9 100644 --- a/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeWriter.cpp +++ b/cpp-ch/local-engine/Storages/MergeTree/SparkMergeTreeWriter.cpp @@ -18,8 +18,6 @@ #include #include -#include -#include #include #include #include @@ -28,11 +26,6 @@ #include #include -namespace DB::Setting -{ -extern const SettingsUInt64 min_insert_block_size_rows; -extern const SettingsUInt64 min_insert_block_size_bytes; -} using namespace DB; namespace { @@ -125,12 +118,6 @@ std::unique_ptr SparkMergeTreeWriter::create( // // auto stats = std::make_shared(header, sink_helper); // chain.addSink(stats); - // - chain.addSource(std::make_shared( - header, settings[Setting::min_insert_block_size_rows], settings[Setting::min_insert_block_size_bytes])); - chain.addSource(std::make_shared( - header, settings[Setting::min_insert_block_size_rows], settings[Setting::min_insert_block_size_bytes])); - return std::make_unique(header, sink_helper, QueryPipeline{std::move(chain)}, spark_job_id); } diff --git a/cpp-ch/local-engine/tests/gtest_write_pipeline.cpp b/cpp-ch/local-engine/tests/gtest_write_pipeline.cpp index a01dd363c56c..a36601d6afa5 100644 --- a/cpp-ch/local-engine/tests/gtest_write_pipeline.cpp +++ 
b/cpp-ch/local-engine/tests/gtest_write_pipeline.cpp @@ -146,7 +146,7 @@ TEST(WritePipeline, SubstraitFileSink) DB::Names expected{"s_suppkey", "s_name", "s_address", "s_nationkey", "s_phone", "s_acctbal", "s_comment111"}; EXPECT_EQ(expected, names); - auto partitionCols = collect_partition_cols(block, table_schema); + auto partitionCols = collect_partition_cols(block, table_schema, {}); DB::Names expected_partition_cols; EXPECT_EQ(expected_partition_cols, partitionCols); @@ -164,7 +164,7 @@ TEST(WritePipeline, SubstraitFileSink) INCBIN(native_write_one_partition, SOURCE_DIR "/utils/extern-local-engine/tests/json/native_write_one_partition.json"); -TEST(WritePipeline, SubstraitPartitionedFileSink) +/*TEST(WritePipeline, SubstraitPartitionedFileSink) { const auto context = DB::Context::createCopy(QueryContext::globalContext()); GlutenWriteSettings settings{ @@ -193,7 +193,7 @@ TEST(WritePipeline, SubstraitPartitionedFileSink) DB::Names expected{"s_suppkey", "s_name", "s_address", "s_phone", "s_acctbal", "s_comment", "s_nationkey"}; EXPECT_EQ(expected, names); - auto partitionCols = local_engine::collect_partition_cols(block, table_schema); + auto partitionCols = local_engine::collect_partition_cols(block, table_schema, {}); DB::Names expected_partition_cols{"s_nationkey"}; EXPECT_EQ(expected_partition_cols, partitionCols); @@ -201,12 +201,12 @@ TEST(WritePipeline, SubstraitPartitionedFileSink) const Block & x = *local_executor->nextColumnar(); debug::headBlock(x, 25); EXPECT_EQ(25, x.rows()); -} +}*/ TEST(WritePipeline, ComputePartitionedExpression) { const auto context = DB::Context::createCopy(QueryContext::globalContext()); - + Block sample_block{{STRING(), "name"}, {UINT(), "s_nationkey"}}; auto partition_by = SubstraitPartitionedFileSink::make_partition_expression({"s_nationkey", "name"}, sample_block); // auto partition_by = printColumn("s_nationkey"); diff --git a/cpp-ch/local-engine/tests/gtest_write_pipeline_mergetree.cpp b/cpp-ch/local-engine/tests/gtest_write_pipeline_mergetree.cpp index 1ad90060f475..a5cd3fd7f39c 100644 --- a/cpp-ch/local-engine/tests/gtest_write_pipeline_mergetree.cpp +++ b/cpp-ch/local-engine/tests/gtest_write_pipeline_mergetree.cpp @@ -258,11 +258,18 @@ TEST(MergeTree, SparkMergeTree) INCBIN(_3_mergetree_plan_input_, SOURCE_DIR "/utils/extern-local-engine/tests/json/mergetree/lineitem_parquet_input.json"); namespace { -void writeMerge(std::string_view json_plan, - const std::string & outputPath , - const std::function & callback, std::optional input = std::nullopt) +void writeMerge( + std::string_view json_plan, + const std::string & outputPath, + const std::function & callback, + std::optional input = std::nullopt) { const auto context = DB::Context::createCopy(QueryContext::globalContext()); + + auto queryid = QueryContext::instance().initializeQuery("gtest_mergetree"); + SCOPE_EXIT({ QueryContext::instance().finalizeQuery(queryid); }); + + GlutenWriteSettings settings{.task_write_tmp_dir = outputPath}; settings.set(context); SparkMergeTreeWritePartitionSettings partition_settings{.part_name_prefix = "pipline_prefix"}; @@ -279,18 +286,24 @@ INCBIN(_3_mergetree_plan_, SOURCE_DIR "/utils/extern-local-engine/tests/json/mer INCBIN(_4_mergetree_plan_, SOURCE_DIR "/utils/extern-local-engine/tests/json/mergetree/4_one_pipeline.json"); TEST(MergeTree, Pipeline) { - writeMerge(EMBEDDED_PLAN(_3_mergetree_plan_),"tmp/lineitem_mergetree",[&](const DB::Block & block) - { - EXPECT_EQ(1, block.rows()); - debug::headBlock(block); - }); + writeMerge( + 
EMBEDDED_PLAN(_3_mergetree_plan_), + "tmp/lineitem_mergetree", + [&](const DB::Block & block) + { + EXPECT_EQ(1, block.rows()); + debug::headBlock(block); + }); } TEST(MergeTree, PipelineWithPartition) { - writeMerge(EMBEDDED_PLAN(_4_mergetree_plan_),"tmp/lineitem_mergetree_p",[&](const DB::Block & block) - { - EXPECT_EQ(2525, block.rows()); - debug::headBlock(block); - }); + writeMerge( + EMBEDDED_PLAN(_4_mergetree_plan_), + "tmp/lineitem_mergetree_p", + [&](const DB::Block & block) + { + EXPECT_EQ(3815, block.rows()); + debug::headBlock(block); + }); } \ No newline at end of file diff --git a/cpp-ch/local-engine/tests/json/mergetree/4_one_pipeline.json b/cpp-ch/local-engine/tests/json/mergetree/4_one_pipeline.json index 14a9b3dda2ad..513f54a707d4 100644 --- a/cpp-ch/local-engine/tests/json/mergetree/4_one_pipeline.json +++ b/cpp-ch/local-engine/tests/json/mergetree/4_one_pipeline.json @@ -9,13 +9,18 @@ "optimization": { "@type": "type.googleapis.com/local_engine.Write", "common": { - "format": "mergetree" + "format": "mergetree", + "partitionColIndex": [ + 10, + 8 + ] }, "mergetree": { "database": "default", - "table": "lineitem_mergetree_insertoverwrite2", - "snapshotId": "1731309448915_0", - "orderByKey": "tuple()", + "table": "lineitem_mergetree_partition", + "snapshotId": "1734145864855_0", + "orderByKey": "l_orderkey", + "primaryKey": "l_orderkey", "storagePolicy": "default" } }, @@ -221,7 +226,7 @@ "NORMAL_COL", "NORMAL_COL", "NORMAL_COL", - "NORMAL_COL", + "PARTITION_COL", "NORMAL_COL", "PARTITION_COL", "NORMAL_COL", @@ -232,138 +237,171 @@ ] }, "input": { - "read": { + "sort": { "common": { "direct": {} }, - "baseSchema": { - "names": [ - "l_orderkey", - "l_partkey", - "l_suppkey", - "l_linenumber", - "l_quantity", - "l_extendedprice", - "l_discount", - "l_tax", - "l_returnflag", - "l_linestatus", - "l_shipdate", - "l_commitdate", - "l_receiptdate", - "l_shipinstruct", - "l_shipmode", - "l_comment" - ], - "struct": { - "types": [ - { - "i64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "i64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "i64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "i64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "fp64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "fp64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "fp64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "fp64": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "string": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "string": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "date": { - "nullability": "NULLABILITY_NULLABLE" - } + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "l_orderkey", + "l_partkey", + "l_suppkey", + "l_linenumber", + "l_quantity", + "l_extendedprice", + "l_discount", + "l_tax", + "l_returnflag", + "l_linestatus", + "l_shipdate", + "l_commitdate", + "l_receiptdate", + "l_shipinstruct", + "l_shipmode", + "l_comment" + ], + "struct": { + "types": [ + { + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fp64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fp64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fp64": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fp64": { + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "string": { + "nullability": "NULLABILITY_NULLABLE" + } + } + ] }, - { - "date": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "date": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "string": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "string": { - "nullability": "NULLABILITY_NULLABLE" - } - }, - { - "string": { - "nullability": "NULLABILITY_NULLABLE" + "columnTypes": [ + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL", + "NORMAL_COL" + ] + }, + "advancedExtension": { + "optimization": { + "@type": "type.googleapis.com/google.protobuf.StringValue", + "value": "isMergeTree=0\n" + } + } + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } } } - ] + }, + "direction": "SORT_DIRECTION_ASC_NULLS_FIRST" }, - "columnTypes": [ - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL", - "NORMAL_COL" - ] - }, - "advancedExtension": { - "optimization": { - "@type": "type.googleapis.com/google.protobuf.StringValue", - "value": "isMergeTree=0\n" + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_FIRST" } - } + ] } } } diff --git a/cpp/core/utils/Timer.h b/cpp/core/utils/Timer.h index b6dec29b1a6a..4fe39068bb77 100644 --- a/cpp/core/utils/Timer.h +++ b/cpp/core/utils/Timer.h @@ -19,11 +19,11 @@ #include -using TimePoint = std::chrono::time_point; - namespace gluten { +template class Timer { public: + using TimePoint = std::chrono::time_point; explicit Timer() = default; void start() { @@ -36,8 +36,7 @@ class Timer { return; } running_ = false; - realTimeUsed_ += - std::chrono::duration_cast(std::chrono::steady_clock::now() - startTime_).count(); + realTimeUsed_ += std::chrono::duration_cast(std::chrono::steady_clock::now() - startTime_).count(); } void reset() { @@ -62,13 +61,14 @@ class Timer { int64_t realTimeUsed_ = 0; }; -class ScopedTimer { +template +class ScopedTimerImpl { public: - explicit ScopedTimer(int64_t* toAdd) : toAdd_(toAdd) { + explicit ScopedTimerImpl(int64_t* toAdd) : toAdd_(toAdd) { startInternal(); } - ~ScopedTimer() { + ~ScopedTimerImpl() { stopInternal(); } @@ -79,7 +79,7 @@ class ScopedTimer { } private: - Timer timer_{}; + Timer timer_{}; int64_t* toAdd_; void stopInternal() { @@ -92,4 +92,10 @@ class ScopedTimer { timer_.start(); } }; + +using ScopedTimer = ScopedTimerImpl; +using ScopedSecondsTimer = ScopedTimerImpl; +using ScopedMillisecondsTimer = ScopedTimerImpl; +using ScopedMicrosecondsTimer = ScopedTimerImpl; + } // namespace gluten diff --git a/cpp/velox/CMakeLists.txt b/cpp/velox/CMakeLists.txt index 
9fb8b6c6a6af..6266ae5e2f94 100644 --- a/cpp/velox/CMakeLists.txt +++ b/cpp/velox/CMakeLists.txt @@ -119,10 +119,7 @@ macro(find_awssdk) endmacro() macro(find_gcssdk) - set(CMAKE_FIND_LIBRARY_SUFFIXES_BCK ${CMAKE_FIND_LIBRARY_SUFFIXES}) - set(CMAKE_FIND_LIBRARY_SUFFIXES ".a") find_package(google_cloud_cpp_storage CONFIG 2.22.0 REQUIRED) - set(CMAKE_FIND_LIBRARY_SUFFIXES ${CMAKE_FIND_LIBRARY_SUFFIXES_BCK}) endmacro() macro(find_azure) diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc index 682bf0fcd5d6..996b3bdce033 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc @@ -61,7 +61,6 @@ const std::unordered_set kRegexFunctions = { const std::unordered_set kBlackList = { "split_part", "factorial", - "concat_ws", "from_json", "json_array_length", "trunc", @@ -300,10 +299,13 @@ bool SubstraitToVeloxPlanValidator::validateCast( case TypeKind::VARBINARY: LOG_VALIDATION_MSG("Invalid input type in casting: ARRAY/MAP/ROW/VARBINARY."); return false; - case TypeKind::TIMESTAMP: { - LOG_VALIDATION_MSG("Casting from TIMESTAMP is not supported or has incorrect result."); - return false; - } + case TypeKind::TIMESTAMP: + // Only support cast timestamp to date + if (!toType->isDate()) { + LOG_VALIDATION_MSG( + "Casting from TIMESTAMP to " + toType->toString() + " is not supported or has incorrect result."); + return false; + } default: { } } diff --git a/docs/developers/HowTo.md b/docs/developers/HowTo.md index 22ad3e30efc7..dce32d55c02d 100644 --- a/docs/developers/HowTo.md +++ b/docs/developers/HowTo.md @@ -134,16 +134,16 @@ to let it override the corresponding C standard functions entirely. It may help Now, both Parquet and DWRF format files are supported, related scripts and files are under the directory of `${GLUTEN_HOME}/backends-velox/workload/tpch`. The file `README.md` under `${GLUTEN_HOME}/backends-velox/workload/tpch` offers some useful help, but it's still not enough and exact. -One way of run TPC-H test is to run velox-be by workflow, you can refer to [velox_be.yml](https://github.com/apache/incubator-gluten/blob/main/.github/workflows/velox_be.yml#L90) +One way of run TPC-H test is to run velox-be by workflow, you can refer to [velox_backend.yml](https://github.com/apache/incubator-gluten/blob/main/.github/workflows/velox_backend.yml#L280) Here we will explain how to run TPC-H on Velox backend with the Parquet file format. 1. First, prepare the datasets, you have two choices. - - One way, generate Parquet datasets using the script under `${GLUTEN_HOME}/backends-velox/workload/tpch/gen_data/parquet_dataset`, you can get help from the above + - One way, generate Parquet datasets using the script under `${GLUTEN_HOME}/tools/workload/tpch/gen_data/parquet_dataset`, you can get help from the above -mentioned `README.md`. - The other way, using the small dataset under `${GLUTEN_HOME}/backends-velox/src/test/resources/tpch-data-parquet` directly, if you just want to make simple TPC-H testing, this dataset is a good choice. 2. Second, run TPC-H on Velox backend testing. - - Modify `${GLUTEN_HOME}/backends-velox/workload/tpch/run_tpch/tpch_parquet.scala`. + - Modify `${GLUTEN_HOME}/tools/workload/tpch/run_tpch/tpch_parquet.scala`. - Set `var parquet_file_path` to correct directory. 
If using the small dataset directly in the step one, then modify it as below: ```scala @@ -156,12 +156,12 @@ Here we will explain how to run TPC-H on Velox backend with the Parquet file for var gluten_root = "/home/gluten" ``` - - Modify `${GLUTEN_HOME}/backends-velox/workload/tpch/run_tpch/tpch_parquet.sh`. + - Modify `${GLUTEN_HOME}/tools/workload/tpch/run_tpch/tpch_parquet.sh`. - Set `GLUTEN_JAR` correctly. Please refer to the section of [Build Gluten with Velox Backend](../get-started/Velox.md/#2-build-gluten-with-velox-backend) - Set `SPARK_HOME` correctly. - Set the memory configurations appropriately. - Execute `tpch_parquet.sh` using the below command. - - `cd ${GLUTEN_HOME}/backends-velox/workload/tpch/run_tpch/` + - `cd ${GLUTEN_HOME}/tools/workload/tpch/run_tpch/` - `./tpch_parquet.sh` # How to run TPC-DS diff --git a/ep/build-clickhouse/src/package.sh b/ep/build-clickhouse/src/package.sh index 2583727b212e..06ca63c5d4d9 100755 --- a/ep/build-clickhouse/src/package.sh +++ b/ep/build-clickhouse/src/package.sh @@ -90,9 +90,8 @@ function build_gluten_by_spark_version() { sv=$(echo "$spark_profile" | tr -d '.') echo "build gluten with spark ${spark_profile}, scala ${scala_version}" - mvn clean install -Pbackends-clickhouse -Pspark-"${spark_profile}" -Pscala-"${scala_version}" -Pceleborn -Piceberg -DskipTests -Dcheckstyle.skip + mvn clean install -Pbackends-clickhouse -Pspark-"${spark_profile}" -Pscala-"${scala_version}" -Pceleborn -Piceberg -Pdelta -DskipTests -Dcheckstyle.skip cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-spark-"${spark_profile}"-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark"${sv}"/gluten.jar - cp "${GLUTEN_SOURCE}"/gluten-celeborn/clickhouse/target/gluten-celeborn-clickhouse-"${PROJECT_VERSION}"-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark"${sv}" delta_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-"${spark_profile}" --non-recursive exec:exec) delta_package_name=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.package.name}' -Pspark-"${spark_profile}" --non-recursive exec:exec) wget https://repo1.maven.org/maven2/io/delta/"${delta_package_name}"_${scala_version}/"${delta_version}"/"${delta_package_name}"_${scala_version}-"${delta_version}".jar -P "${PACKAGE_DIR_PATH}"/jars/spark"${sv}" diff --git a/gluten-arrow/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java b/gluten-arrow/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java index b221db13e375..9d63a8601b4d 100644 --- a/gluten-arrow/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java +++ b/gluten-arrow/src/main/java/org/apache/gluten/memory/listener/ReservationListeners.java @@ -52,18 +52,7 @@ private static ReservationListener create0( tmm, name, Spillers.withMinSpillSize(spiller, reservationBlockSize), mutableStats); final MemoryTarget overConsumer = MemoryTargets.newConsumer( - tmm, - consumer.name() + ".OverAcquire", - new Spiller() { - @Override - public long spill(MemoryTarget self, Phase phase, long size) { - if (!Spillers.PHASE_SET_ALL.contains(phase)) { - return 0L; - } - return self.repay(size); - } - }, - Collections.emptyMap()); + tmm, consumer.name() + ".OverAcquire", Spillers.NOOP, Collections.emptyMap()); final MemoryTarget target = MemoryTargets.throwOnOom( MemoryTargets.overAcquire( diff --git a/gluten-celeborn/clickhouse/pom.xml b/gluten-celeborn/clickhouse/pom.xml deleted file mode 100755 index 21263443d735..000000000000 --- 
a/gluten-celeborn/clickhouse/pom.xml +++ /dev/null @@ -1,260 +0,0 @@ - - - - gluten-celeborn - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gluten-celeborn-clickhouse - jar - Gluten Celeborn Clickhouse - - - - org.apache.gluten - backends-clickhouse - ${project.version} - provided - - - org.apache.gluten - backends-clickhouse - ${project.version} - test-jar - test - - - org.apache.gluten - gluten-substrait - ${project.version} - test-jar - test - - - org.apache.gluten - gluten-celeborn-common - ${project.version} - compile - - - org.apache.spark - spark-core_${scala.binary.version} - test-jar - test - - - org.apache.spark - spark-sql_${scala.binary.version} - test-jar - test - - - org.apache.spark - spark-catalyst_${scala.binary.version} - test-jar - test - - - org.apache.spark - spark-yarn_${scala.binary.version} - ${spark.version} - test-jar - test - - - org.apache.spark - spark-hive_${scala.binary.version} - ${spark.version} - test-jar - test - - - org.apache.spark - spark-hive_${scala.binary.version} - test - - - org.apache.hive.hcatalog - hive-hcatalog-core - 2.3.9 - test - - - org.pentaho - pentaho-aggdesigner-algorithm - - - net.minidev - json-smart - - - org.apache.hive - hive-exec - - - guava - com.google.guava - - - hadoop-common - org.apache.hadoop - - - hadoop-hdfs - org.apache.hadoop - - - - - io.delta - ${delta.package.name}_${scala.binary.version} - test - - - junit - junit - - - org.mockito - mockito-core - 2.23.4 - test - - - org.scalatestplus - scalatestplus-mockito_${scala.binary.version} - 1.0.0-M2 - test - - - org.scalatest - scalatest_${scala.binary.version} - test - - - org.scalatestplus - scalatestplus-scalacheck_${scala.binary.version} - 3.1.0.0-RC2 - test - - - org.apache.hadoop - hadoop-client - ${hadoop.version} - test - - - org.apache.arrow - arrow-memory-core - ${arrow.version} - provided - - - io.netty - netty-common - - - io.netty - netty-buffer - - - - - org.apache.arrow - arrow-vector - ${arrow.version} - provided - - - io.netty - netty-common - - - io.netty - netty-buffer - - - - - - - target/scala-${scala.binary.version}/classes - target/scala-${scala.binary.version}/test-classes - - - org.apache.maven.plugins - maven-resources-plugin - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - org.scalastyle - scalastyle-maven-plugin - - - org.apache.maven.plugins - maven-checkstyle-plugin - - - maven-assembly-plugin - 3.3.0 - - - jar-with-dependencies - - - - - make-assembly - package - - single - - - - - - org.scalatest - scalatest-maven-plugin - - - test - - test - - - - ${clickhouse.lib.path} - ${tpcds.data.path} - - - - - - - org.apache.maven.plugins - maven-jar-plugin - - - prepare-test-jar - test-compile - - test-jar - - - - - - - diff --git a/gluten-celeborn/common/pom.xml b/gluten-celeborn/common/pom.xml deleted file mode 100755 index da7e68987659..000000000000 --- a/gluten-celeborn/common/pom.xml +++ /dev/null @@ -1,47 +0,0 @@ - - - - gluten-celeborn - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gluten-celeborn-common - jar - Gluten Celeborn Common - - - target/scala-${scala.binary.version}/classes - target/scala-${scala.binary.version}/test-classes - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - org.scalastyle - scalastyle-maven-plugin - - - org.apache.maven.plugins - maven-checkstyle-plugin - - - com.diffplug.spotless - spotless-maven-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - - diff 
--git a/gluten-celeborn/package/pom.xml b/gluten-celeborn/package/pom.xml deleted file mode 100644 index 7b18787b4e16..000000000000 --- a/gluten-celeborn/package/pom.xml +++ /dev/null @@ -1,38 +0,0 @@ - - 4.0.0 - - gluten-celeborn - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - - gluten-celeborn-package - jar - Gluten Celeborn Package - - - - backends-velox - - - org.apache.gluten - gluten-celeborn-velox - ${project.version} - - - - - backends-clickhouse - - - org.apache.gluten - gluten-celeborn-clickhouse - ${project.version} - - - - - diff --git a/gluten-celeborn/pom.xml b/gluten-celeborn/pom.xml index de19132b38f8..0eca5da979e1 100755 --- a/gluten-celeborn/pom.xml +++ b/gluten-celeborn/pom.xml @@ -11,7 +11,7 @@ 4.0.0 gluten-celeborn - pom + jar Gluten Celeborn @@ -56,50 +56,19 @@ - - - - net.alchim31.maven - scala-maven-plugin - - true - - -Xss128m - - - - - org.scalastyle - scalastyle-maven-plugin - - - com.diffplug.spotless - spotless-maven-plugin - - - + + + net.alchim31.maven + scala-maven-plugin + + + org.scalastyle + scalastyle-maven-plugin + + + com.diffplug.spotless + spotless-maven-plugin + + - - - - backends-velox - - - - velox - common - package - - - - backends-clickhouse - - - - clickhouse - common - package - - - diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java b/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java similarity index 100% rename from gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java rename to gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleManager.java diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleWriterFactory.java b/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleWriterFactory.java similarity index 100% rename from gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleWriterFactory.java rename to gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornShuffleWriterFactory.java diff --git a/gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java b/gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java similarity index 100% rename from gluten-celeborn/common/src/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java rename to gluten-celeborn/src-celeborn/main/java/org/apache/spark/shuffle/gluten/celeborn/CelebornUtils.java diff --git a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala b/gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala similarity index 99% rename from gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala rename to gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala index 10cf06a3ce59..42e939e4420d 100644 --- a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala +++ b/gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornColumnarShuffleWriter.scala @@ -16,8 +16,9 @@ */ package org.apache.spark.shuffle +import org.apache.celeborn.client.ShuffleClient +import 
org.apache.celeborn.common.CelebornConf import org.apache.gluten.GlutenConfig - import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.internal.config.SHUFFLE_COMPRESS @@ -26,9 +27,6 @@ import org.apache.spark.shuffle.celeborn.CelebornShuffleHandle import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.storage.BlockManager -import org.apache.celeborn.client.ShuffleClient -import org.apache.celeborn.common.CelebornConf - import java.io.IOException import java.util.Locale diff --git a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala b/gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala similarity index 99% rename from gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala rename to gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala index 2f59307230a0..545a4c113936 100644 --- a/gluten-celeborn/common/src/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala +++ b/gluten-celeborn/src-celeborn/main/scala/org/apache/spark/shuffle/CelebornPartitionPusher.scala @@ -16,11 +16,10 @@ */ package org.apache.spark.shuffle +import org.apache.celeborn.client.ShuffleClient import org.apache.spark._ import org.apache.spark.internal.Logging -import org.apache.celeborn.client.ShuffleClient - import java.io.IOException class CelebornPartitionPusher( diff --git a/gluten-celeborn/velox/pom.xml b/gluten-celeborn/velox/pom.xml deleted file mode 100755 index 55aa8f3c9b5f..000000000000 --- a/gluten-celeborn/velox/pom.xml +++ /dev/null @@ -1,68 +0,0 @@ - - - - gluten-celeborn - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gluten-celeborn-velox - jar - Gluten Celeborn Velox - - - - org.apache.gluten - backends-velox - ${project.version} - provided - - - org.apache.gluten - gluten-arrow - ${project.version} - provided - - - org.apache.gluten - gluten-celeborn-common - ${project.version} - compile - - - - - target/scala-${scala.binary.version}/classes - target/scala-${scala.binary.version}/test-classes - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - org.scalastyle - scalastyle-maven-plugin - - - org.apache.maven.plugins - maven-checkstyle-plugin - - - com.diffplug.spotless - spotless-maven-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - - diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargetVisitor.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargetVisitor.java index e58dbb295b08..f6ef49a78920 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargetVisitor.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargetVisitor.java @@ -28,11 +28,13 @@ public interface MemoryTargetVisitor { T visit(TreeMemoryConsumer treeMemoryConsumer); - T visit(TreeMemoryTargets.Node node); + T visit(TreeMemoryConsumer.Node node); T visit(LoggingMemoryTarget loggingMemoryTarget); T visit(NoopMemoryTarget noopMemoryTarget); T visit(DynamicOffHeapSizingMemoryTarget dynamicOffHeapSizingMemoryTarget); + + T visit(RetryOnOomMemoryTarget retryOnOomMemoryTarget); } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java index 6f7cc9bd9c9c..c6f5b59de8c2 100644 --- 
a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/MemoryTargets.java @@ -20,12 +20,17 @@ import org.apache.gluten.memory.MemoryUsageStatsBuilder; import org.apache.gluten.memory.memtarget.spark.TreeMemoryConsumers; +import org.apache.spark.SparkEnv; import org.apache.spark.annotation.Experimental; import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.util.SparkResourceUtil; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.util.Map; public final class MemoryTargets { + private static final Logger LOGGER = LoggerFactory.getLogger(MemoryTargets.class); private MemoryTargets() { // enclose factory ctor @@ -57,13 +62,35 @@ public static TreeMemoryTarget newConsumer( String name, Spiller spiller, Map virtualChildren) { - final TreeMemoryConsumers.Factory factory; + final TreeMemoryConsumers.Factory factory = TreeMemoryConsumers.factory(tmm); if (GlutenConfig.getConf().memoryIsolation()) { - factory = TreeMemoryConsumers.isolated(); - } else { - factory = TreeMemoryConsumers.shared(); + return TreeMemoryTargets.newChild(factory.isolatedRoot(), name, spiller, virtualChildren); } - - return factory.newConsumer(tmm, name, spiller, virtualChildren); + final TreeMemoryTarget root = factory.legacyRoot(); + final TreeMemoryTarget consumer = + TreeMemoryTargets.newChild(root, name, spiller, virtualChildren); + if (SparkEnv.get() == null) { + // We are likely in test code. Return the consumer directly. + LOGGER.info("SparkEnv not found. We are likely in test code."); + return consumer; + } + final int taskSlots = SparkResourceUtil.getTaskSlots(SparkEnv.get().conf()); + if (taskSlots == 1) { + // We don't need to retry on OOM in the case one single task occupies the whole executor. + return consumer; + } + // Since https://github.com/apache/incubator-gluten/pull/8132. + // Retry of spilling is needed in multi-slot and legacy mode (formerly named as share mode) + // because the maxMemoryPerTask defined by vanilla Spark's ExecutionMemoryPool is dynamic. + // + // See the original issue https://github.com/apache/incubator-gluten/issues/8128. + return new RetryOnOomMemoryTarget( + consumer, + () -> { + LOGGER.info("Request for spilling on consumer {}...", consumer.name()); + // Note: Spill from root node so other consumers also get spilled. 
+ long spilled = TreeMemoryTargets.spillTree(root, Long.MAX_VALUE); + LOGGER.info("Consumer {} spilled {} bytes.", consumer.name(), spilled); + }); } } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java index e7321b4b7e0e..7724083d6852 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/OverAcquire.java @@ -57,13 +57,15 @@ public long borrow(long size) { } Preconditions.checkState(overTarget.usedBytes() == 0); long granted = target.borrow(size); - long majorSize = target.usedBytes(); - long overSize = (long) (ratio * majorSize); - long overAcquired = overTarget.borrow(overSize); - Preconditions.checkState(overAcquired == overTarget.usedBytes()); - long releasedOverSize = overTarget.repay(overAcquired); - Preconditions.checkState(releasedOverSize == overAcquired); - Preconditions.checkState(overTarget.usedBytes() == 0); + if (granted >= size) { + long majorSize = target.usedBytes(); + long overSize = (long) (ratio * majorSize); + long overAcquired = overTarget.borrow(overSize); + Preconditions.checkState(overAcquired == overTarget.usedBytes()); + long releasedOverSize = overTarget.repay(overAcquired); + Preconditions.checkState(releasedOverSize == overAcquired); + Preconditions.checkState(overTarget.usedBytes() == 0); + } return granted; } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/RetryOnOomMemoryTarget.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/RetryOnOomMemoryTarget.java new file mode 100644 index 000000000000..b564bbcaa41c --- /dev/null +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/RetryOnOomMemoryTarget.java @@ -0,0 +1,106 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.gluten.memory.memtarget; + +import org.apache.gluten.memory.MemoryUsageStatsBuilder; +import org.apache.gluten.proto.MemoryUsageStats; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.Map; + +public class RetryOnOomMemoryTarget implements TreeMemoryTarget { + private static final Logger LOGGER = LoggerFactory.getLogger(RetryOnOomMemoryTarget.class); + private final TreeMemoryTarget target; + private final Runnable onRetry; + + RetryOnOomMemoryTarget(TreeMemoryTarget target, Runnable onRetry) { + this.target = target; + this.onRetry = onRetry; + } + + @Override + public long borrow(long size) { + long granted = target.borrow(size); + if (granted < size) { + LOGGER.info("Granted size {} is less than requested size {}, retrying...", granted, size); + final long remaining = size - granted; + // Invoke the `onRetry` callback, then retry borrowing. + // It's usually expected to run extra spilling logics in + // the `onRetry` callback so we may get enough memory space + // to allocate the remaining bytes. + onRetry.run(); + granted += target.borrow(remaining); + LOGGER.info("Newest granted size after retrying: {}, requested size {}.", granted, size); + } + return granted; + } + + @Override + public long repay(long size) { + return target.repay(size); + } + + @Override + public long usedBytes() { + return target.usedBytes(); + } + + @Override + public T accept(MemoryTargetVisitor visitor) { + return visitor.visit(this); + } + + @Override + public String name() { + return target.name(); + } + + @Override + public MemoryUsageStats stats() { + return target.stats(); + } + + @Override + public TreeMemoryTarget newChild( + String name, + long capacity, + Spiller spiller, + Map virtualChildren) { + return target.newChild(name, capacity, spiller, virtualChildren); + } + + @Override + public Map children() { + return target.children(); + } + + @Override + public TreeMemoryTarget parent() { + return target.parent(); + } + + @Override + public Spiller getNodeSpiller() { + return target.getNodeSpiller(); + } + + public TreeMemoryTarget target() { + return target; + } +} diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java index 26c6ea48008a..6d94e7206959 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/TreeMemoryTargets.java @@ -17,18 +17,10 @@ package org.apache.gluten.memory.memtarget; import org.apache.gluten.memory.MemoryUsageStatsBuilder; -import org.apache.gluten.memory.SimpleMemoryUsageRecorder; -import org.apache.gluten.proto.MemoryUsageStats; -import com.google.common.base.Preconditions; -import org.apache.spark.util.Utils; - -import java.util.Collections; -import java.util.HashMap; import java.util.Map; import java.util.PriorityQueue; import java.util.Queue; -import java.util.stream.Collectors; public class TreeMemoryTargets { @@ -36,13 +28,16 @@ private TreeMemoryTargets() { // enclose factory ctor } - public static TreeMemoryTarget newChild( + /** + * A short-cut method to create a child target of `parent`. The child will follow the parent's + * maximum capacity. 
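The borrow path of the new `RetryOnOomMemoryTarget` above is easiest to follow in isolation: when the wrapped target grants less than requested, the `onRetry` callback (which in this patch spills the whole consumer tree from the root) runs once, and the remaining bytes are requested again. Below is a minimal, self-contained model of that flow; `SimpleTarget`, the capacities, and the spill callback are illustrative stand-ins, not Gluten APIs.

```java
// Self-contained model of the retry-on-OOM borrow flow; names are illustrative, not Gluten classes.
import java.util.concurrent.atomic.AtomicLong;

public class RetryOnOomBorrowDemo {
  /** Minimal stand-in for a memory target with a hard capacity. */
  static class SimpleTarget {
    private final long capacity;
    private final AtomicLong used = new AtomicLong();

    SimpleTarget(long capacity) { this.capacity = capacity; }

    /** Caps the grant at the currently free capacity. */
    long borrow(long size) {
      long free = capacity - used.get();
      long granted = Math.min(free, size);
      used.addAndGet(granted);
      return granted;
    }

    long repay(long size) {
      long freed = Math.min(used.get(), size);
      used.addAndGet(-freed);
      return freed;
    }
  }

  /** Mirrors the shape of RetryOnOomMemoryTarget#borrow: run the callback once, then retry. */
  static long borrowWithRetry(SimpleTarget target, long size, Runnable onRetry) {
    long granted = target.borrow(size);
    if (granted < size) {
      onRetry.run(); // expected to spill and free memory
      granted += target.borrow(size - granted);
    }
    return granted;
  }

  public static void main(String[] args) {
    SimpleTarget target = new SimpleTarget(64);
    // Occupy most of the capacity so the next reservation cannot be fully granted.
    target.borrow(60);
    long got = borrowWithRetry(target, 16, () -> {
      // The real callback spills the whole consumer tree from the root;
      // here we simply release part of what was previously reserved.
      target.repay(32);
    });
    System.out.println("granted = " + got); // prints 16 after the simulated spill
  }
}
```

The real class additionally forwards `repay`, `stats()`, `children()` and the other `TreeMemoryTarget` methods to the wrapped target, which this model leaves out.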
+ */ + static TreeMemoryTarget newChild( TreeMemoryTarget parent, String name, - long capacity, Spiller spiller, Map virtualChildren) { - return new Node(parent, name, capacity, spiller, virtualChildren); + return parent.newChild(name, TreeMemoryTarget.CAPACITY_UNLIMITED, spiller, virtualChildren); } public static long spillTree(TreeMemoryTarget node, final long bytes) { @@ -83,145 +78,4 @@ private static long spillTree(TreeMemoryTarget node, Spiller.Phase phase, final return bytes - remainingBytes; } - - // non-root nodes are not Spark memory consumer - public static class Node implements TreeMemoryTarget, KnownNameAndStats { - private final Map children = new HashMap<>(); - private final TreeMemoryTarget parent; - private final String name; - private final long capacity; - private final Spiller spiller; - private final Map virtualChildren; - private final SimpleMemoryUsageRecorder selfRecorder = new SimpleMemoryUsageRecorder(); - - private Node( - TreeMemoryTarget parent, - String name, - long capacity, - Spiller spiller, - Map virtualChildren) { - this.parent = parent; - this.capacity = capacity; - final String uniqueName = MemoryTargetUtil.toUniqueName(name); - if (capacity == CAPACITY_UNLIMITED) { - this.name = uniqueName; - } else { - this.name = String.format("%s, %s", uniqueName, Utils.bytesToString(capacity)); - } - this.spiller = spiller; - this.virtualChildren = virtualChildren; - } - - @Override - public long borrow(long size) { - if (size == 0) { - return 0; - } - ensureFreeCapacity(size); - return borrow0(Math.min(freeBytes(), size)); - } - - private long freeBytes() { - return capacity - usedBytes(); - } - - private long borrow0(long size) { - long granted = parent.borrow(size); - selfRecorder.inc(granted); - return granted; - } - - @Override - public Spiller getNodeSpiller() { - return spiller; - } - - private boolean ensureFreeCapacity(long bytesNeeded) { - while (true) { // FIXME should we add retry limit? 
- long freeBytes = freeBytes(); - Preconditions.checkState(freeBytes >= 0); - if (freeBytes >= bytesNeeded) { - // free bytes fit requirement - return true; - } - // spill - long bytesToSpill = bytesNeeded - freeBytes; - long spilledBytes = TreeMemoryTargets.spillTree(this, bytesToSpill); - Preconditions.checkState(spilledBytes >= 0); - if (spilledBytes == 0) { - // OOM - return false; - } - } - } - - @Override - public long repay(long size) { - if (size == 0) { - return 0; - } - long toFree = Math.min(usedBytes(), size); - long freed = parent.repay(toFree); - selfRecorder.inc(-freed); - return freed; - } - - @Override - public long usedBytes() { - return selfRecorder.current(); - } - - @Override - public T accept(MemoryTargetVisitor visitor) { - return visitor.visit(this); - } - - @Override - public String name() { - return name; - } - - @Override - public MemoryUsageStats stats() { - final Map childrenStats = - new HashMap<>( - children.entrySet().stream() - .collect(Collectors.toMap(e -> e.getValue().name(), e -> e.getValue().stats()))); - - Preconditions.checkState(childrenStats.size() == children.size()); - - // add virtual children - for (Map.Entry entry : virtualChildren.entrySet()) { - if (childrenStats.containsKey(entry.getKey())) { - throw new IllegalArgumentException("Child stats already exists: " + entry.getKey()); - } - childrenStats.put(entry.getKey(), entry.getValue().toStats()); - } - return selfRecorder.toStats(childrenStats); - } - - @Override - public TreeMemoryTarget newChild( - String name, - long capacity, - Spiller spiller, - Map virtualChildren) { - final Node child = new Node(this, name, capacity, spiller, virtualChildren); - if (children.containsKey(child.name())) { - throw new IllegalArgumentException("Child already registered: " + child.name()); - } - children.put(child.name(), child); - return child; - } - - @Override - public Map children() { - return Collections.unmodifiableMap(children); - } - - @Override - public TreeMemoryTarget parent() { - return parent; - } - } } diff --git a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java index 44c725798c75..38ac7d9733b6 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumer.java @@ -25,6 +25,7 @@ import org.apache.spark.memory.MemoryConsumer; import org.apache.spark.memory.MemoryMode; import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.util.Utils; import java.io.IOException; import java.util.Collections; @@ -121,8 +122,7 @@ public TreeMemoryTarget newChild( long capacity, Spiller spiller, Map virtualChildren) { - final TreeMemoryTarget child = - TreeMemoryTargets.newChild(this, name, capacity, spiller, virtualChildren); + final TreeMemoryTarget child = new Node(this, name, capacity, spiller, virtualChildren); if (children.containsKey(child.name())) { throw new IllegalArgumentException("Child already registered: " + child.name()); } @@ -138,7 +138,8 @@ public Map children() { @Override public TreeMemoryTarget parent() { // we are root - throw new IllegalStateException("Unreachable code"); + throw new IllegalStateException( + "Unreachable code org.apache.gluten.memory.memtarget.spark.TreeMemoryConsumer.parent"); } @Override @@ -150,4 +151,145 @@ public Spiller getNodeSpiller() { public TaskMemoryManager getTaskMemoryManager() 
{ return taskMemoryManager; } + + public static class Node implements TreeMemoryTarget, KnownNameAndStats { + private final Map children = new HashMap<>(); + private final TreeMemoryTarget parent; + private final String name; + private final long capacity; + private final Spiller spiller; + private final Map virtualChildren; + private final SimpleMemoryUsageRecorder selfRecorder = new SimpleMemoryUsageRecorder(); + + private Node( + TreeMemoryTarget parent, + String name, + long capacity, + Spiller spiller, + Map virtualChildren) { + this.parent = parent; + this.capacity = capacity; + final String uniqueName = MemoryTargetUtil.toUniqueName(name); + if (capacity == TreeMemoryTarget.CAPACITY_UNLIMITED) { + this.name = uniqueName; + } else { + this.name = String.format("%s, %s", uniqueName, Utils.bytesToString(capacity)); + } + this.spiller = spiller; + this.virtualChildren = virtualChildren; + } + + @Override + public long borrow(long size) { + if (size == 0) { + return 0; + } + ensureFreeCapacity(size); + return borrow0(Math.min(freeBytes(), size)); + } + + private long freeBytes() { + return capacity - usedBytes(); + } + + private long borrow0(long size) { + long granted = parent.borrow(size); + selfRecorder.inc(granted); + return granted; + } + + @Override + public Spiller getNodeSpiller() { + return spiller; + } + + private boolean ensureFreeCapacity(long bytesNeeded) { + while (true) { // FIXME should we add retry limit? + long freeBytes = freeBytes(); + Preconditions.checkState(freeBytes >= 0); + if (freeBytes >= bytesNeeded) { + // free bytes fit requirement + return true; + } + // spill + long bytesToSpill = bytesNeeded - freeBytes; + long spilledBytes = TreeMemoryTargets.spillTree(this, bytesToSpill); + Preconditions.checkState(spilledBytes >= 0); + if (spilledBytes == 0) { + // OOM + return false; + } + } + } + + @Override + public long repay(long size) { + if (size == 0) { + return 0; + } + long toFree = Math.min(usedBytes(), size); + long freed = parent.repay(toFree); + selfRecorder.inc(-freed); + return freed; + } + + @Override + public long usedBytes() { + return selfRecorder.current(); + } + + @Override + public T accept(MemoryTargetVisitor visitor) { + return visitor.visit(this); + } + + @Override + public String name() { + return name; + } + + @Override + public MemoryUsageStats stats() { + final Map childrenStats = + new HashMap<>( + children.entrySet().stream() + .collect(Collectors.toMap(e -> e.getValue().name(), e -> e.getValue().stats()))); + + Preconditions.checkState(childrenStats.size() == children.size()); + + // add virtual children + for (Map.Entry entry : virtualChildren.entrySet()) { + if (childrenStats.containsKey(entry.getKey())) { + throw new IllegalArgumentException("Child stats already exists: " + entry.getKey()); + } + childrenStats.put(entry.getKey(), entry.getValue().toStats()); + } + return selfRecorder.toStats(childrenStats); + } + + @Override + public TreeMemoryTarget newChild( + String name, + long capacity, + Spiller spiller, + Map virtualChildren) { + final Node child = + new Node(this, name, Math.min(this.capacity, capacity), spiller, virtualChildren); + if (children.containsKey(child.name())) { + throw new IllegalArgumentException("Child already registered: " + child.name()); + } + children.put(child.name(), child); + return child; + } + + @Override + public Map children() { + return Collections.unmodifiableMap(children); + } + + @Override + public TreeMemoryTarget parent() { + return parent; + } + } } diff --git 
a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java index 7ab05bd3a2e7..a11a4a3e4a19 100644 --- a/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java +++ b/gluten-core/src/main/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumers.java @@ -17,78 +17,67 @@ package org.apache.gluten.memory.memtarget.spark; import org.apache.gluten.GlutenConfig; -import org.apache.gluten.memory.MemoryUsageStatsBuilder; -import org.apache.gluten.memory.memtarget.Spiller; import org.apache.gluten.memory.memtarget.Spillers; import org.apache.gluten.memory.memtarget.TreeMemoryTarget; import org.apache.commons.collections.map.ReferenceMap; import org.apache.spark.memory.TaskMemoryManager; +import org.apache.spark.util.Utils; import java.util.Collections; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; public final class TreeMemoryConsumers { - private static final Map FACTORIES = new ConcurrentHashMap<>(); + private static final ReferenceMap FACTORIES = new ReferenceMap(); private TreeMemoryConsumers() {} - private static Factory createOrGetFactory(long perTaskCapacity) { - return FACTORIES.computeIfAbsent(perTaskCapacity, Factory::new); - } - - /** - * A hub to provide memory target instances whose shared size (in the same task) is limited to X, - * X = executor memory / task slots. - * - *

<p>Using this to prevent OOMs if the delegated memory target could possibly hold large memory - * blocks that are not spillable. - * - *
<p>
See GLUTEN-3030 - */ - public static Factory isolated() { - return createOrGetFactory(GlutenConfig.getConf().conservativeTaskOffHeapMemorySize()); - } - - /** - * This works as a legacy Spark memory consumer which grants as much as possible of memory - * capacity to each task. - */ - public static Factory shared() { - return createOrGetFactory(TreeMemoryTarget.CAPACITY_UNLIMITED); + @SuppressWarnings("unchecked") + public static Factory factory(TaskMemoryManager tmm) { + synchronized (FACTORIES) { + return (Factory) FACTORIES.computeIfAbsent(tmm, m -> new Factory((TaskMemoryManager) m)); + } } public static class Factory { - private final ReferenceMap map = new ReferenceMap(ReferenceMap.WEAK, ReferenceMap.WEAK); - private final long perTaskCapacity; + private final TreeMemoryConsumer sparkConsumer; + private final Map roots = new ConcurrentHashMap<>(); + + private Factory(TaskMemoryManager tmm) { + this.sparkConsumer = new TreeMemoryConsumer(tmm); + } - private Factory(long perTaskCapacity) { - this.perTaskCapacity = perTaskCapacity; + private TreeMemoryTarget ofCapacity(long capacity) { + return roots.computeIfAbsent( + capacity, + cap -> + sparkConsumer.newChild( + String.format("Capacity[%s]", Utils.bytesToString(cap)), + cap, + Spillers.NOOP, + Collections.emptyMap())); } - @SuppressWarnings("unchecked") - private TreeMemoryTarget getSharedAccount(TaskMemoryManager tmm) { - synchronized (map) { - return (TreeMemoryTarget) - map.computeIfAbsent( - tmm, - m -> { - TreeMemoryTarget tmc = new TreeMemoryConsumer((TaskMemoryManager) m); - return tmc.newChild( - "root", perTaskCapacity, Spillers.NOOP, Collections.emptyMap()); - }); - } + /** + * This works as a legacy Spark memory consumer which grants as much as possible of memory + * capacity to each task. + */ + public TreeMemoryTarget legacyRoot() { + return ofCapacity(TreeMemoryTarget.CAPACITY_UNLIMITED); } - public TreeMemoryTarget newConsumer( - TaskMemoryManager tmm, - String name, - Spiller spiller, - Map virtualChildren) { - final TreeMemoryTarget account = getSharedAccount(tmm); - return account.newChild( - name, TreeMemoryConsumer.CAPACITY_UNLIMITED, spiller, virtualChildren); + /** + * A hub to provide memory target instances whose shared size (in the same task) is limited to + * X, X = executor memory / task slots. + * + *

<p>Using this to prevent OOMs if the delegated memory target could possibly hold large memory + * blocks that are not spillable. + * + *
<p>
See GLUTEN-3030 + */ + public TreeMemoryTarget isolatedRoot() { + return ofCapacity(GlutenConfig.getConf().conservativeTaskOffHeapMemorySize()); } } } diff --git a/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala b/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala index d221fafce418..338854cf086c 100644 --- a/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala +++ b/gluten-core/src/main/scala/org/apache/spark/memory/SparkMemoryUtil.scala @@ -111,7 +111,7 @@ object SparkMemoryUtil { collectFromTaskMemoryManager(treeMemoryConsumer.getTaskMemoryManager) } - override def visit(node: TreeMemoryTargets.Node): String = { + override def visit(node: TreeMemoryConsumer.Node): String = { node.parent().accept(this) // walk up to find the one bound with task memory manager } @@ -131,6 +131,10 @@ object SparkMemoryUtil { dynamicOffHeapSizingMemoryTarget: DynamicOffHeapSizingMemoryTarget): String = { dynamicOffHeapSizingMemoryTarget.delegated().accept(this) } + + override def visit(retryOnOomMemoryTarget: RetryOnOomMemoryTarget): String = { + retryOnOomMemoryTarget.target().accept(this) + } }) } diff --git a/gluten-core/src/main/scala/org/apache/spark/task/TaskResources.scala b/gluten-core/src/main/scala/org/apache/spark/task/TaskResources.scala index b061aa332c74..2f609b026db3 100644 --- a/gluten-core/src/main/scala/org/apache/spark/task/TaskResources.scala +++ b/gluten-core/src/main/scala/org/apache/spark/task/TaskResources.scala @@ -16,6 +16,7 @@ */ package org.apache.spark.task +import org.apache.gluten.GlutenConfig import org.apache.gluten.task.TaskListener import org.apache.spark.{TaskContext, TaskFailedReason, TaskKilledException, UnknownReason} @@ -65,8 +66,8 @@ object TaskResources extends TaskListener with Logging { properties.put(key, value) case _ => } - properties.setIfMissing("spark.memory.offHeap.enabled", "true") - properties.setIfMissing("spark.memory.offHeap.size", "1TB") + properties.setIfMissing(GlutenConfig.SPARK_OFFHEAP_ENABLED, "true") + properties.setIfMissing(GlutenConfig.SPARK_OFFHEAP_SIZE_KEY, "1TB") TaskContext.setTaskContext(newUnsafeTaskContext(properties)) } @@ -298,9 +299,14 @@ class TaskResourceRegistry extends Logging { o1: util.Map.Entry[Int, util.LinkedHashSet[TaskResource]], o2: util.Map.Entry[Int, util.LinkedHashSet[TaskResource]]) => { val diff = o2.getKey - o1.getKey // descending by priority - if (diff > 0) 1 - else if (diff < 0) -1 - else throw new IllegalStateException("Unreachable code") + if (diff > 0) { + 1 + } else if (diff < 0) { + -1 + } else { + throw new IllegalStateException( + "Unreachable code from org.apache.spark.task.TaskResourceRegistry.releaseAll") + } } ) table.forEach { diff --git a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java index befe449186e7..6cb38fe8d5d3 100644 --- a/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java +++ b/gluten-core/src/test/java/org/apache/gluten/memory/memtarget/spark/TreeMemoryConsumerTest.java @@ -49,13 +49,16 @@ public void setUp() throws Exception { public void testIsolated() { test( () -> { - final TreeMemoryConsumers.Factory factory = TreeMemoryConsumers.isolated(); + final TreeMemoryConsumers.Factory factory = + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()); final TreeMemoryTarget consumer = - factory.newConsumer( - 
TaskContext.get().taskMemoryManager(), - "FOO", - Spillers.NOOP, - Collections.emptyMap()); + factory + .isolatedRoot() + .newChild( + "FOO", + TreeMemoryTarget.CAPACITY_UNLIMITED, + Spillers.NOOP, + Collections.emptyMap()); Assert.assertEquals(20, consumer.borrow(20)); Assert.assertEquals(70, consumer.borrow(70)); Assert.assertEquals(10, consumer.borrow(20)); @@ -64,16 +67,19 @@ public void testIsolated() { } @Test - public void testShared() { + public void testLegacy() { test( () -> { - final TreeMemoryConsumers.Factory factory = TreeMemoryConsumers.shared(); + final TreeMemoryConsumers.Factory factory = + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()); final TreeMemoryTarget consumer = - factory.newConsumer( - TaskContext.get().taskMemoryManager(), - "FOO", - Spillers.NOOP, - Collections.emptyMap()); + factory + .legacyRoot() + .newChild( + "FOO", + TreeMemoryTarget.CAPACITY_UNLIMITED, + Spillers.NOOP, + Collections.emptyMap()); Assert.assertEquals(20, consumer.borrow(20)); Assert.assertEquals(70, consumer.borrow(70)); Assert.assertEquals(20, consumer.borrow(20)); @@ -82,22 +88,24 @@ public void testShared() { } @Test - public void testIsolatedAndShared() { + public void testIsolatedAndLegacy() { test( () -> { - final TreeMemoryTarget shared = - TreeMemoryConsumers.shared() - .newConsumer( - TaskContext.get().taskMemoryManager(), + final TreeMemoryTarget legacy = + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()) + .legacyRoot() + .newChild( "FOO", + TreeMemoryTarget.CAPACITY_UNLIMITED, Spillers.NOOP, Collections.emptyMap()); - Assert.assertEquals(110, shared.borrow(110)); + Assert.assertEquals(110, legacy.borrow(110)); final TreeMemoryTarget isolated = - TreeMemoryConsumers.isolated() - .newConsumer( - TaskContext.get().taskMemoryManager(), + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()) + .isolatedRoot() + .newChild( "FOO", + TreeMemoryTarget.CAPACITY_UNLIMITED, Spillers.NOOP, Collections.emptyMap()); Assert.assertEquals(100, isolated.borrow(110)); @@ -109,36 +117,34 @@ public void testSpill() { test( () -> { final Spillers.AppendableSpillerList spillers = Spillers.appendable(); - final TreeMemoryTarget shared = - TreeMemoryConsumers.shared() - .newConsumer( - TaskContext.get().taskMemoryManager(), - "FOO", - spillers, - Collections.emptyMap()); + final TreeMemoryTarget legacy = + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()) + .legacyRoot() + .newChild( + "FOO", TreeMemoryTarget.CAPACITY_UNLIMITED, spillers, Collections.emptyMap()); final AtomicInteger numSpills = new AtomicInteger(0); final AtomicLong numSpilledBytes = new AtomicLong(0L); spillers.append( new Spiller() { @Override public long spill(MemoryTarget self, Phase phase, long size) { - long repaid = shared.repay(size); + long repaid = legacy.repay(size); numSpills.getAndIncrement(); numSpilledBytes.getAndAdd(repaid); return repaid; } }); - Assert.assertEquals(300, shared.borrow(300)); - Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); Assert.assertEquals(1, numSpills.get()); Assert.assertEquals(200, numSpilledBytes.get()); - Assert.assertEquals(400, shared.usedBytes()); + Assert.assertEquals(400, legacy.usedBytes()); - Assert.assertEquals(300, shared.borrow(300)); - Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); Assert.assertEquals(3, numSpills.get()); 
Assert.assertEquals(800, numSpilledBytes.get()); - Assert.assertEquals(400, shared.usedBytes()); + Assert.assertEquals(400, legacy.usedBytes()); }); } @@ -147,36 +153,34 @@ public void testOverSpill() { test( () -> { final Spillers.AppendableSpillerList spillers = Spillers.appendable(); - final TreeMemoryTarget shared = - TreeMemoryConsumers.shared() - .newConsumer( - TaskContext.get().taskMemoryManager(), - "FOO", - spillers, - Collections.emptyMap()); + final TreeMemoryTarget legacy = + TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager()) + .legacyRoot() + .newChild( + "FOO", TreeMemoryTarget.CAPACITY_UNLIMITED, spillers, Collections.emptyMap()); final AtomicInteger numSpills = new AtomicInteger(0); final AtomicLong numSpilledBytes = new AtomicLong(0L); spillers.append( new Spiller() { @Override public long spill(MemoryTarget self, Phase phase, long size) { - long repaid = shared.repay(Long.MAX_VALUE); + long repaid = legacy.repay(Long.MAX_VALUE); numSpills.getAndIncrement(); numSpilledBytes.getAndAdd(repaid); return repaid; } }); - Assert.assertEquals(300, shared.borrow(300)); - Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); Assert.assertEquals(1, numSpills.get()); Assert.assertEquals(300, numSpilledBytes.get()); - Assert.assertEquals(300, shared.usedBytes()); + Assert.assertEquals(300, legacy.usedBytes()); - Assert.assertEquals(300, shared.borrow(300)); - Assert.assertEquals(300, shared.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); + Assert.assertEquals(300, legacy.borrow(300)); Assert.assertEquals(3, numSpills.get()); Assert.assertEquals(900, numSpilledBytes.get()); - Assert.assertEquals(300, shared.usedBytes()); + Assert.assertEquals(300, legacy.usedBytes()); }); } diff --git a/gluten-hudi/pom.xml b/gluten-hudi/pom.xml index 7900182f853a..5865f1f6ece8 100755 --- a/gluten-hudi/pom.xml +++ b/gluten-hudi/pom.xml @@ -46,19 +46,6 @@ test-jar test - - org.apache.gluten - backends-velox - ${project.version} - test - - - org.apache.gluten - backends-velox - ${project.version} - test-jar - test - org.apache.spark spark-core_${scala.binary.version} diff --git a/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxHudiSuite.scala b/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/HudiSuite.scala similarity index 98% rename from gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxHudiSuite.scala rename to gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/HudiSuite.scala index b760ec556535..97633fa064cc 100644 --- a/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/VeloxHudiSuite.scala +++ b/gluten-hudi/src-hudi/test/scala/org/apache/gluten/execution/HudiSuite.scala @@ -19,7 +19,7 @@ package org.apache.gluten.execution import org.apache.spark.SparkConf import org.apache.spark.sql.Row -class VeloxHudiSuite extends WholeStageTransformerSuite { +abstract class HudiSuite extends WholeStageTransformerSuite { protected val rootPath: String = getClass.getResource("/").getPath override protected val resourcePath: String = "/tpch-data-parquet" diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala index 69cea9c5470d..984450bf164e 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala +++ 
b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/TransformerApi.scala @@ -16,12 +16,13 @@ */ package org.apache.gluten.backendsapi +import org.apache.gluten.execution.WriteFilesExecTransformer import org.apache.gluten.substrait.expression.ExpressionNode import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression} import org.apache.spark.sql.connector.read.InputPartition import org.apache.spark.sql.execution.SparkPlan -import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation, PartitionDirectory} +import org.apache.spark.sql.execution.datasources.{HadoopFsRelation, PartitionDirectory} import org.apache.spark.sql.types.{DataType, DecimalType, StructType} import org.apache.spark.util.collection.BitSet @@ -75,7 +76,7 @@ trait TransformerApi { /** This method is only used for CH backend tests */ def invalidateSQLExecutionResource(executionId: String): Unit = {} - def genWriteParameters(fileFormat: FileFormat, writeOptions: Map[String, String]): Any + def genWriteParameters(write: WriteFilesExecTransformer): Any /** use Hadoop Path class to encode the file path */ def encodeFilePathIfNeed(filePath: String): String = filePath diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala b/gluten-substrait/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala index a9d3a6282ae1..726dbdc3ef30 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/execution/WriteFilesExecTransformer.scala @@ -67,7 +67,7 @@ case class WriteFilesExecTransformer( override def output: Seq[Attribute] = Seq.empty - private val caseInsensitiveOptions = CaseInsensitiveMap(options) + val caseInsensitiveOptions: CaseInsensitiveMap[String] = CaseInsensitiveMap(options) def getRelNode( context: SubstraitContext, @@ -99,8 +99,7 @@ case class WriteFilesExecTransformer( ConverterUtils.collectAttributeNames(inputAttributes.toSeq) val extensionNode = if (!validation) { ExtensionBuilder.makeAdvancedExtension( - BackendsApiManager.getTransformerApiInstance - .genWriteParameters(fileFormat, caseInsensitiveOptions), + BackendsApiManager.getTransformerApiInstance.genWriteParameters(this), SubstraitUtil.createEnhancement(originalInputAttributes) ) } else { diff --git a/gluten-substrait/src/main/scala/org/apache/spark/sql/execution/ApplyResourceProfileExec.scala b/gluten-substrait/src/main/scala/org/apache/spark/sql/execution/ApplyResourceProfileExec.scala new file mode 100644 index 000000000000..17640f461213 --- /dev/null +++ b/gluten-substrait/src/main/scala/org/apache/spark/sql/execution/ApplyResourceProfileExec.scala @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.execution + +import org.apache.gluten.execution.GlutenPlan +import org.apache.gluten.extension.columnar.transition.Convention + +import org.apache.spark.annotation.Experimental +import org.apache.spark.rdd.RDD +import org.apache.spark.resource.ResourceProfile +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.{Attribute, SortOrder} +import org.apache.spark.sql.catalyst.plans.physical.{Distribution, Partitioning} +import org.apache.spark.sql.vectorized.ColumnarBatch + +/** + * Used to apply specified resource profile for the whole stage. + * @param child + * @param resourceProfile + * resource profile specified for child belong stage. + */ +@Experimental +case class ApplyResourceProfileExec(child: SparkPlan, resourceProfile: ResourceProfile) + extends UnaryExecNode + with GlutenPlan { + + override def batchType(): Convention.BatchType = { + Convention.get(child).batchType + } + + override def rowType0(): Convention.RowType = { + Convention.get(child).rowType + } + + override def outputPartitioning: Partitioning = { + child.outputPartitioning + } + + override def requiredChildDistribution: scala.Seq[Distribution] = { + child.requiredChildDistribution + } + + override def outputOrdering: scala.Seq[SortOrder] = { + child.outputOrdering + } + + override def requiredChildOrdering: scala.Seq[scala.Seq[SortOrder]] = { + child.requiredChildOrdering + } + + override protected def doExecute(): RDD[InternalRow] = { + log.info(s"Apply $resourceProfile for plan ${child.nodeName}") + child.execute.withResources(resourceProfile) + } + + override protected def doExecuteColumnar(): RDD[ColumnarBatch] = { + log.info(s"Apply $resourceProfile for columnar plan ${child.nodeName}") + child.executeColumnar.withResources(resourceProfile) + } + + override def output: scala.Seq[Attribute] = child.output + + override protected def withNewChildInternal(newChild: SparkPlan): ApplyResourceProfileExec = + copy(child = newChild) +} diff --git a/gluten-uniffle/.gitkeep b/gluten-uniffle/.gitkeep new file mode 100644 index 000000000000..f2d1254d2735 --- /dev/null +++ b/gluten-uniffle/.gitkeep @@ -0,0 +1 @@ +The module is kept for adding common code shared by backends for Uniffle support in Gluten. 
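Reviewer note on the gluten-core memory changes above: TreeMemoryConsumers no longer exposes the static isolated()/shared() entry points; callers obtain a Factory bound to the task's TaskMemoryManager and create children under legacyRoot() or isolatedRoot(). A minimal sketch of the new call pattern, modeled on the updated TreeMemoryConsumerTest; the class and consumer names below are illustrative and not part of this patch, and it assumes it runs inside a live Spark task:

```java
import org.apache.gluten.memory.memtarget.Spillers;
import org.apache.gluten.memory.memtarget.TreeMemoryTarget;
import org.apache.gluten.memory.memtarget.spark.TreeMemoryConsumers;

import org.apache.spark.TaskContext;

import java.util.Collections;

// Hypothetical caller, only stringing together APIs introduced by this patch.
public final class MemoryTargetUsageSketch {
  private MemoryTargetUsageSketch() {}

  public static long borrowAndRepay(long bytes) {
    // One Factory per TaskMemoryManager; repeated calls in the same task reuse it.
    final TreeMemoryConsumers.Factory factory =
        TreeMemoryConsumers.factory(TaskContext.get().taskMemoryManager());

    // legacyRoot() replaces the removed shared() entry point (unlimited capacity);
    // isolatedRoot() would instead cap the per-task shared size (see GLUTEN-3030).
    final TreeMemoryTarget consumer =
        factory
            .legacyRoot()
            .newChild(
                "SKETCH", // illustrative consumer name
                TreeMemoryTarget.CAPACITY_UNLIMITED,
                Spillers.NOOP,
                Collections.emptyMap());

    final long granted = consumer.borrow(bytes); // may be less than requested
    return consumer.repay(granted);
  }
}
```

Since the Factory is cached per TaskMemoryManager in a ReferenceMap, children created by different operators within the same task end up sharing the same capacity roots, which is what the reworked tests below rely on.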
diff --git a/gluten-uniffle/package/pom.xml b/gluten-uniffle/package/pom.xml deleted file mode 100644 index e49748e7c8e9..000000000000 --- a/gluten-uniffle/package/pom.xml +++ /dev/null @@ -1,29 +0,0 @@ - - - - gluten-uniffle - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gluten-uniffle-package - jar - Gluten Uniffle Package - - - - backends-velox - - - org.apache.gluten - gluten-uniffle-velox - ${project.version} - - - - - diff --git a/gluten-uniffle/pom.xml b/gluten-uniffle/pom.xml index b7fe4c2e4268..efc8ce6555c5 100644 --- a/gluten-uniffle/pom.xml +++ b/gluten-uniffle/pom.xml @@ -11,7 +11,7 @@ 4.0.0 gluten-uniffle - pom + jar Gluten Uniffle @@ -75,15 +75,4 @@ - - - backends-velox - - - - velox - package - - - diff --git a/gluten-uniffle/velox/pom.xml b/gluten-uniffle/velox/pom.xml deleted file mode 100755 index ab730674fbb3..000000000000 --- a/gluten-uniffle/velox/pom.xml +++ /dev/null @@ -1,62 +0,0 @@ - - - - gluten-uniffle - org.apache.gluten - 1.3.0-SNAPSHOT - ../pom.xml - - 4.0.0 - - gluten-uniffle-velox - jar - Gluten Uniffle Velox - - - - org.apache.gluten - backends-velox - ${project.version} - provided - - - org.apache.gluten - gluten-arrow - ${project.version} - provided - - - - - target/scala-${scala.binary.version}/classes - target/scala-${scala.binary.version}/test-classes - - - net.alchim31.maven - scala-maven-plugin - - - org.apache.maven.plugins - maven-compiler-plugin - - - org.scalastyle - scalastyle-maven-plugin - - - org.apache.maven.plugins - maven-checkstyle-plugin - - - com.diffplug.spotless - spotless-maven-plugin - - - org.apache.maven.plugins - maven-jar-plugin - - - - diff --git a/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala b/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala index dce8ac83710c..51e8174da7fb 100644 --- a/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala +++ b/gluten-ut/common/src/test/scala/org/apache/gluten/utils/BackendTestSettings.scala @@ -80,7 +80,8 @@ abstract class BackendTestSettings { return !isExcluded } - throw new IllegalStateException("Unreachable code") + throw new IllegalStateException( + "Unreachable code from org.apache.gluten.utils.BackendTestSettings.shouldRun") } final protected class SuiteSettings { diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala index 36d5b5177c6b..16879489d29e 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/clickhouse/ClickHouseTestSettings.scala @@ -286,6 +286,7 @@ class ClickHouseTestSettings extends BackendTestSettings { .exclude("groupBy.as") enableSuite[GlutenDateFunctionsSuite] .exclude("function to_date") + .excludeGlutenTest("function to_date") .exclude("unix_timestamp") .exclude("to_unix_timestamp") .exclude("to_timestamp") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 15495270a189..2c6b882850c4 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -265,6 +265,9 @@ class VeloxTestSettings extends 
BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") + // Legacy mode is not supported and velox getTimestamp function does not throw + // exception when format is "yyyy-dd-aa". + .exclude("function to_date") enableSuite[GlutenDataFrameFunctionsSuite] // blocked by Velox-5768 .exclude("aggregate function - array for primitive type containing null") diff --git a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index 8d1f7320dd42..5ddfe6fc1ff3 100644 --- a/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark32/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -248,4 +248,93 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } + + testGluten("function to_date") { + val d1 = Date.valueOf("2015-07-22") + val d2 = Date.valueOf("2015-07-01") + val d3 = Date.valueOf("2014-12-31") + val t1 = Timestamp.valueOf("2015-07-22 10:00:00") + val t2 = Timestamp.valueOf("2014-12-31 23:59:59") + val t3 = Timestamp.valueOf("2014-12-31 23:59:59") + val s1 = "2015-07-22 10:00:00" + val s2 = "2014-12-31" + val s3 = "2014-31-12" + val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s") + + checkAnswer( + df.select(to_date(col("t"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("s"))), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + checkAnswer( + df.selectExpr("to_date(t)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(d)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(s)"), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + // now with format + checkAnswer( + df.select(to_date(col("t"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + withSQLConf(confKey -> "corrected") { + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + } + // legacyParserPolicy is not respected by Gluten. + // withSQLConf(confKey -> "exception") { + // checkExceptionMessage(df.select(to_date(col("s"), "yyyy-MM-dd"))) + // } + + // now switch format + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-MM")), + Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31")))) + + // invalid format + checkAnswer(df.select(to_date(col("s"), "yyyy-hh-MM")), Seq(Row(null), Row(null), Row(null))) + // velox getTimestamp function does not throw exception when format is "yyyy-dd-aa". 
+ // val e = + // intercept[SparkUpgradeException](df.select(to_date(col("s"), "yyyy-dd-aa")).collect()) + // assert(e.getCause.isInstanceOf[IllegalArgumentException]) + // assert( + // e.getMessage.contains("You may get a different result due to the upgrading to Spark")) + + // February + val x1 = "2016-02-29" + val x2 = "2017-02-29" + val df1 = Seq(x1, x2).toDF("x") + checkAnswer(df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil) + } } diff --git a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index 407b9c8b95cc..f83b91ede1cc 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -1084,6 +1084,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") + // Legacy mode is not supported and velox getTimestamp function does not throw + // exception when format is "yyyy-dd-aa". + .exclude("function to_date") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] diff --git a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index a946e6de4345..ae86c9d06e81 100644 --- a/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark33/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,4 +246,93 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } + + testGluten("function to_date") { + val d1 = Date.valueOf("2015-07-22") + val d2 = Date.valueOf("2015-07-01") + val d3 = Date.valueOf("2014-12-31") + val t1 = Timestamp.valueOf("2015-07-22 10:00:00") + val t2 = Timestamp.valueOf("2014-12-31 23:59:59") + val t3 = Timestamp.valueOf("2014-12-31 23:59:59") + val s1 = "2015-07-22 10:00:00" + val s2 = "2014-12-31" + val s3 = "2014-31-12" + val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s") + + checkAnswer( + df.select(to_date(col("t"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("s"))), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + checkAnswer( + df.selectExpr("to_date(t)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(d)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(s)"), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + // now with format + checkAnswer( + df.select(to_date(col("t"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + 
df.select(to_date(col("d"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + withSQLConf(confKey -> "corrected") { + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + } + // legacyParserPolicy is not respected by Gluten. + // withSQLConf(confKey -> "exception") { + // checkExceptionMessage(df.select(to_date(col("s"), "yyyy-MM-dd"))) + // } + + // now switch format + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-MM")), + Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31")))) + + // invalid format + checkAnswer(df.select(to_date(col("s"), "yyyy-hh-MM")), Seq(Row(null), Row(null), Row(null))) + // velox getTimestamp function does not throw exception when format is "yyyy-dd-aa". + // val e = + // intercept[SparkUpgradeException](df.select(to_date(col("s"), "yyyy-dd-aa")).collect()) + // assert(e.getCause.isInstanceOf[IllegalArgumentException]) + // assert( + // e.getMessage.contains("You may get a different result due to the upgrading to Spark")) + + // February + val x1 = "2016-02-29" + val x2 = "2017-02-29" + val df1 = Seq(x1, x2).toDF("x") + checkAnswer(df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil) + } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index dbb01fbe7067..b0446d3ca7b6 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -1101,6 +1101,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") + // Legacy mode is not supported and velox getTimestamp function does not throw + // exception when format is "yyyy-dd-aa". 
+ .exclude("function to_date") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index a946e6de4345..ae86c9d06e81 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,4 +246,93 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } + + testGluten("function to_date") { + val d1 = Date.valueOf("2015-07-22") + val d2 = Date.valueOf("2015-07-01") + val d3 = Date.valueOf("2014-12-31") + val t1 = Timestamp.valueOf("2015-07-22 10:00:00") + val t2 = Timestamp.valueOf("2014-12-31 23:59:59") + val t3 = Timestamp.valueOf("2014-12-31 23:59:59") + val s1 = "2015-07-22 10:00:00" + val s2 = "2014-12-31" + val s3 = "2014-31-12" + val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s") + + checkAnswer( + df.select(to_date(col("t"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("s"))), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + checkAnswer( + df.selectExpr("to_date(t)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(d)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(s)"), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + // now with format + checkAnswer( + df.select(to_date(col("t"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + withSQLConf(confKey -> "corrected") { + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + } + // legacyParserPolicy is not respected by Gluten. + // withSQLConf(confKey -> "exception") { + // checkExceptionMessage(df.select(to_date(col("s"), "yyyy-MM-dd"))) + // } + + // now switch format + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-MM")), + Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31")))) + + // invalid format + checkAnswer(df.select(to_date(col("s"), "yyyy-hh-MM")), Seq(Row(null), Row(null), Row(null))) + // velox getTimestamp function does not throw exception when format is "yyyy-dd-aa". 
+ // val e = + // intercept[SparkUpgradeException](df.select(to_date(col("s"), "yyyy-dd-aa")).collect()) + // assert(e.getCause.isInstanceOf[IllegalArgumentException]) + // assert( + // e.getMessage.contains("You may get a different result due to the upgrading to Spark")) + + // February + val x1 = "2016-02-29" + val x2 = "2017-02-29" + val df1 = Seq(x1, x2).toDF("x") + checkAnswer(df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil) + } } diff --git a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenCastSuite.scala b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenCastSuite.scala index b8ac906d8076..f2a83bf234a9 100644 --- a/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenCastSuite.scala +++ b/gluten-ut/spark34/src/test/scala/org/apache/spark/sql/catalyst/expressions/GlutenCastSuite.scala @@ -40,15 +40,12 @@ class GlutenCastSuite extends CastSuiteBase with GlutenTestsTrait { testGluten("missing cases - from boolean") { (DataTypeTestUtils.numericTypeWithoutDecimal + BooleanType).foreach { - t => - t match { - case BooleanType => - checkEvaluation(cast(cast(true, BooleanType), t), true) - checkEvaluation(cast(cast(false, BooleanType), t), false) - case _ => - checkEvaluation(cast(cast(true, BooleanType), t), 1) - checkEvaluation(cast(cast(false, BooleanType), t), 0) - } + case t @ BooleanType => + checkEvaluation(cast(cast(true, BooleanType), t), true) + checkEvaluation(cast(cast(false, BooleanType), t), false) + case t => + checkEvaluation(cast(cast(true, BooleanType), t), 1) + checkEvaluation(cast(cast(false, BooleanType), t), 0) } } diff --git a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala index f5a1a076956e..a01d0cb4b331 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/gluten/utils/velox/VeloxTestSettings.scala @@ -1123,6 +1123,9 @@ class VeloxTestSettings extends BackendTestSettings { .exclude("to_timestamp") // Legacy mode is not supported, assuming this mode is not commonly used. .exclude("SPARK-30668: use legacy timestamp parser in to_timestamp") + // Legacy mode is not supported and velox getTimestamp function does not throw + // exception when format is "yyyy-dd-aa". 
+ .exclude("function to_date") enableSuite[GlutenDeprecatedAPISuite] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOff] enableSuite[GlutenDynamicPartitionPruningV1SuiteAEOn] diff --git a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala index a946e6de4345..ae86c9d06e81 100644 --- a/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala +++ b/gluten-ut/spark35/src/test/scala/org/apache/spark/sql/GlutenDateFunctionsSuite.scala @@ -246,4 +246,93 @@ class GlutenDateFunctionsSuite extends DateFunctionsSuite with GlutenSQLTestsTra } } } + + testGluten("function to_date") { + val d1 = Date.valueOf("2015-07-22") + val d2 = Date.valueOf("2015-07-01") + val d3 = Date.valueOf("2014-12-31") + val t1 = Timestamp.valueOf("2015-07-22 10:00:00") + val t2 = Timestamp.valueOf("2014-12-31 23:59:59") + val t3 = Timestamp.valueOf("2014-12-31 23:59:59") + val s1 = "2015-07-22 10:00:00" + val s2 = "2014-12-31" + val s3 = "2014-31-12" + val df = Seq((d1, t1, s1), (d2, t2, s2), (d3, t3, s3)).toDF("d", "t", "s") + + checkAnswer( + df.select(to_date(col("t"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"))), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("s"))), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + checkAnswer( + df.selectExpr("to_date(t)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(d)"), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.selectExpr("to_date(s)"), + Seq(Row(Date.valueOf("2015-07-22")), Row(Date.valueOf("2014-12-31")), Row(null))) + + // now with format + checkAnswer( + df.select(to_date(col("t"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2014-12-31")), + Row(Date.valueOf("2014-12-31")))) + checkAnswer( + df.select(to_date(col("d"), "yyyy-MM-dd")), + Seq( + Row(Date.valueOf("2015-07-22")), + Row(Date.valueOf("2015-07-01")), + Row(Date.valueOf("2014-12-31")))) + val confKey = SQLConf.LEGACY_TIME_PARSER_POLICY.key + withSQLConf(confKey -> "corrected") { + checkAnswer( + df.select(to_date(col("s"), "yyyy-MM-dd")), + Seq(Row(null), Row(Date.valueOf("2014-12-31")), Row(null))) + } + // legacyParserPolicy is not respected by Gluten. + // withSQLConf(confKey -> "exception") { + // checkExceptionMessage(df.select(to_date(col("s"), "yyyy-MM-dd"))) + // } + + // now switch format + checkAnswer( + df.select(to_date(col("s"), "yyyy-dd-MM")), + Seq(Row(null), Row(null), Row(Date.valueOf("2014-12-31")))) + + // invalid format + checkAnswer(df.select(to_date(col("s"), "yyyy-hh-MM")), Seq(Row(null), Row(null), Row(null))) + // velox getTimestamp function does not throw exception when format is "yyyy-dd-aa". 
+ // val e = + // intercept[SparkUpgradeException](df.select(to_date(col("s"), "yyyy-dd-aa")).collect()) + // assert(e.getCause.isInstanceOf[IllegalArgumentException]) + // assert( + // e.getMessage.contains("You may get a different result due to the upgrading to Spark")) + + // February + val x1 = "2016-02-29" + val x2 = "2017-02-29" + val df1 = Seq(x1, x2).toDF("x") + checkAnswer(df1.select(to_date(col("x"))), Row(Date.valueOf("2016-02-29")) :: Row(null) :: Nil) + } } diff --git a/package/pom.xml b/package/pom.xml index e0620e5cf5e1..b9c114181bcd 100644 --- a/package/pom.xml +++ b/package/pom.xml @@ -68,7 +68,7 @@ org.apache.gluten - gluten-celeborn-package + gluten-celeborn ${project.version} @@ -78,7 +78,7 @@ org.apache.gluten - gluten-uniffle-package + gluten-uniffle ${project.version} diff --git a/pom.xml b/pom.xml index 3c59b4f19e11..4d704dc9b448 100644 --- a/pom.xml +++ b/pom.xml @@ -422,6 +422,70 @@ gluten-celeborn + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-celeborn-sources + generate-sources + + add-source + + + + ${project.basedir}/src-celeborn/main/scala + ${project.basedir}/src-celeborn/main/java + + + + + add-celeborn-resources + generate-resources + + add-resource + + + + + ${project.basedir}/src-celeborn/main/resources + + + + + + add-celeborn-test-sources + generate-test-sources + + add-test-source + + + + ${project.basedir}/src-celeborn/test/scala + ${project.basedir}/src-celeborn/test/java + + + + + add-celeborn-test-resources + generate-test-resources + + add-test-resource + + + + + ${project.basedir}/src-celeborn/test/resources + + + + + + + + uniffle @@ -431,6 +495,70 @@ gluten-uniffle + + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-uniffle-sources + generate-sources + + add-source + + + + ${project.basedir}/src-uniffle/main/scala + ${project.basedir}/src-uniffle/main/java + + + + + add-uniffle-resources + generate-resources + + add-resource + + + + + ${project.basedir}/src-uniffle/main/resources + + + + + + add-uniffle-test-sources + generate-test-sources + + add-test-source + + + + ${project.basedir}/src-uniffle/test/scala + ${project.basedir}/src-uniffle/test/java + + + + + add-uniffle-test-resources + generate-test-resources + + add-test-resource + + + + + ${project.basedir}/src-uniffle/test/resources + + + + + + + + delta diff --git a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala index a3bd5079b016..fcdd3c3c8b4b 100644 --- a/shims/spark32/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala +++ b/shims/spark32/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala @@ -73,7 +73,9 @@ abstract class AbstractFileSourceScanExec( override def supportsColumnar: Boolean = { // The value should be defined in GlutenPlan. 
- throw new UnsupportedOperationException("Unreachable code") + throw new UnsupportedOperationException( + "Unreachable code from org.apache.spark.sql.execution.AbstractFileSourceScanExec" + + ".supportsColumnar") } private lazy val needsUnsafeRowConversion: Boolean = { diff --git a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala index c885f0cf44b3..01df5ba62167 100644 --- a/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala +++ b/shims/spark33/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala @@ -77,7 +77,9 @@ abstract class AbstractFileSourceScanExec( override def supportsColumnar: Boolean = { // The value should be defined in GlutenPlan. - throw new UnsupportedOperationException("Unreachable code") + throw new UnsupportedOperationException( + "Unreachable code from org.apache.spark.sql.execution.AbstractFileSourceScanExec" + + ".supportsColumnar") } private lazy val needsUnsafeRowConversion: Boolean = { diff --git a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala index 53ea6f543a95..15e54ddb71f2 100644 --- a/shims/spark34/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala +++ b/shims/spark34/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala @@ -69,7 +69,9 @@ abstract class AbstractFileSourceScanExec( override def supportsColumnar: Boolean = { // The value should be defined in GlutenPlan. - throw new UnsupportedOperationException("Unreachable code") + throw new UnsupportedOperationException( + "Unreachable code from org.apache.spark.sql.execution.AbstractFileSourceScanExec" + + ".supportsColumnar") } private lazy val needsUnsafeRowConversion: Boolean = { diff --git a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala index c8dbcc2fed4f..a83c763c4566 100644 --- a/shims/spark35/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala +++ b/shims/spark35/src/main/scala/org/apache/spark/sql/execution/AbstractFileSourceScanExec.scala @@ -69,7 +69,9 @@ abstract class AbstractFileSourceScanExec( override def supportsColumnar: Boolean = { // The value should be defined in GlutenPlan. - throw new UnsupportedOperationException("Unreachable code") + throw new UnsupportedOperationException( + "Unreachable code from org.apache.spark.sql.execution.AbstractFileSourceScanExec" + + ".supportsColumnar") } private lazy val needsUnsafeRowConversion: Boolean = {
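One closing note on how spilling is wired into the new memory tree: Node.borrow() calls ensureFreeCapacity(), which loops on TreeMemoryTargets.spillTree() until the request fits or a round frees nothing, so a consumer that wants to survive memory pressure should register a Spiller that actually repays bytes. The sketch below mirrors the wiring in the updated TreeMemoryConsumerTest; the helper class and method names are illustrative, and the MemoryTarget/Spiller package locations are assumed from the imports visible in this patch:

```java
import org.apache.gluten.memory.memtarget.MemoryTarget;
import org.apache.gluten.memory.memtarget.Spiller;
import org.apache.gluten.memory.memtarget.Spillers;
import org.apache.gluten.memory.memtarget.TreeMemoryTarget;

import java.util.Collections;

// Hypothetical helper, not part of this patch.
public final class SpillerWiringSketch {
  private SpillerWiringSketch() {}

  // Creates a child that gives memory back when the tree asks it to spill.
  public static TreeMemoryTarget selfRepayingChild(
      TreeMemoryTarget parent, String name, long capacity) {
    final Spillers.AppendableSpillerList spillers = Spillers.appendable();
    final TreeMemoryTarget target =
        parent.newChild(name, capacity, spillers, Collections.emptyMap());
    spillers.append(
        new Spiller() {
          @Override
          public long spill(MemoryTarget self, Phase phase, long size) {
            // Repay at most `size` bytes of this target's own usage. Returning the
            // amount actually freed matters: ensureFreeCapacity() treats a round
            // that spills zero bytes as OOM and stops retrying.
            return target.repay(size);
          }
        });
    return target;
  }
}
```

Passing the result of factory.legacyRoot() or factory.isolatedRoot() as the parent reproduces the borrow/spill/borrow sequence asserted in testSpill() and testOverSpill() above.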