Merge branch 'main' into c2r_oom
FelixYBW authored Jul 20, 2024
2 parents ce67f4c + ec8e5ea commit a565131
Showing 57 changed files with 1,009 additions and 916 deletions.
37 changes: 10 additions & 27 deletions .github/workflows/velox_docker.yml

@@ -63,13 +63,13 @@ jobs:
         with:
           path: |
             ./cpp/build/releases/
-            ~/.m2/repository/org/apache/arrow/
-          key: cache-velox-build-${{ hashFiles('./cache-key') }}
-      - name: Build Gluten Velox third party
+            /root/.m2/repository/org/apache/arrow/
+          key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }}
+      - name: Build Gluten native libraries
         if: ${{ steps.cache.outputs.cache-hit != 'true' }}
         run: |
           df -a
-          source dev/ci-velox-buildstatic.sh
+          bash dev/ci-velox-buildstatic-centos-7.sh
       - name: Upload Artifact Native
         uses: actions/upload-artifact@v2
         with:
@@ -597,32 +597,15 @@ jobs:
         with:
           path: |
             ./cpp/build/releases/
             ./cpp/build/velox/udf/examples/
             ./cpp/build/velox/benchmarks/
             /root/.m2/repository/org/apache/arrow/
           key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }}
-      - name: Update mirror list
-        run: |
-          sed -i -e "s|mirrorlist=|#mirrorlist=|g" /etc/yum.repos.d/CentOS-* || true
-          sed -i -e "s|#baseurl=http://mirror.centos.org|baseurl=http://vault.centos.org|g" /etc/yum.repos.d/CentOS-* || true
-      - name: Setup build dependency
-        if: ${{ steps.cache.outputs.cache-hit != 'true' }}
+      - name: Build Gluten native libraries
+        if: steps.cache.outputs.cache-hit != 'true'
         run: |
-          yum install sudo patch java-1.8.0-openjdk-devel wget -y
-          # Required by building arrow java.
-          wget https://downloads.apache.org/maven/maven-3/3.8.8/binaries/apache-maven-3.8.8-bin.tar.gz
-          tar -xvf apache-maven-3.8.8-bin.tar.gz && mv apache-maven-3.8.8 /usr/lib/maven
-          echo "PATH=${PATH}:/usr/lib/maven/bin" >> $GITHUB_ENV
-      - name: Build Gluten Velox third party
-        if: ${{ steps.cache.outputs.cache-hit != 'true' }}
-        run: |
-          source /opt/rh/gcc-toolset-9/enable
-          source ./dev/build_arrow.sh
-          install_arrow_deps
-          ./dev/builddeps-veloxbe.sh --run_setup_script=OFF --enable_ep_cache=OFF --build_tests=ON \
-            --build_examples=ON --build_benchmarks=ON --build_protobuf=ON
-      - name: Gluten CPP Test
-        run: |
-          cd ./cpp/build && \
-            ctest -V
+          df -a
+          bash dev/ci-velox-buildshared-centos-8.sh
       - uses: actions/upload-artifact@v2
         with:
           name: velox-native-lib-centos-8-${{github.sha}}
51 changes: 44 additions & 7 deletions .github/workflows/velox_docker_cache.yml

@@ -28,7 +28,7 @@ concurrency:
   cancel-in-progress: false

 jobs:
-  cache-native-lib:
+  cache-native-lib-centos-7:
     runs-on: ubuntu-20.04
     container: apache/gluten:gluten-vcpkg-builder_2024_07_11 # centos7 with dependencies installed
     steps:
@@ -44,11 +44,12 @@ jobs:
           path: |
             ./cpp/build/releases/
             /root/.m2/repository/org/apache/arrow/
-          key: cache-velox-build-${{ hashFiles('./cache-key') }}
-      - name: Build Gluten Velox third party
+          key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }}
+      - name: Build Gluten native libraries
         if: steps.check-cache.outputs.cache-hit != 'true'
         run: |
-          source dev/ci-velox-buildstatic.sh
+          df -a
+          bash dev/ci-velox-buildstatic-centos-7.sh
       - name: Cache
         if: steps.check-cache.outputs.cache-hit != 'true'
         id: cache
@@ -57,7 +58,43 @@
         with:
           path: |
             ./cpp/build/releases/
             /root/.m2/repository/org/apache/arrow/
-          key: cache-velox-build-${{ hashFiles('./cache-key') }}
+          key: cache-velox-build-centos-7-${{ hashFiles('./cache-key') }}
+
+  cache-native-lib-centos-8:
+    runs-on: ubuntu-20.04
+    container: ghcr.io/facebookincubator/velox-dev:centos8
+    steps:
+      - uses: actions/checkout@v2
+      - name: Generate cache key
+        run: |
+          echo ${{ hashFiles('./ep/build-velox/src/**', './dev/**', './cpp/*', './github/workflows/*') }} > cache-key
+      - name: Check existing caches
+        id: check-cache
+        uses: actions/cache/restore@v3
+        with:
+          lookup-only: true
+          path: |
+            ./cpp/build/releases/
+            ./cpp/build/velox/udf/examples/
+            ./cpp/build/velox/benchmarks/
+            /root/.m2/repository/org/apache/arrow/
+          key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }}
+      - name: Build Gluten native libraries
+        if: steps.check-cache.outputs.cache-hit != 'true'
+        run: |
+          df -a
+          bash dev/ci-velox-buildshared-centos-8.sh
+      - name: Cache
+        if: steps.check-cache.outputs.cache-hit != 'true'
+        id: cache
+        uses: actions/cache/save@v3
+        with:
+          path: |
+            ./cpp/build/releases/
+            ./cpp/build/velox/udf/examples/
+            ./cpp/build/velox/benchmarks/
+            /root/.m2/repository/org/apache/arrow/
+          key: cache-velox-build-centos-8-${{ hashFiles('./cache-key') }}

 # ccache-native-lib-ubuntu-velox-ut:
 #   runs-on: ubuntu-20.04
@@ -75,7 +112,7 @@ jobs:
 #       working-directory: ${{ github.workspace }}
 #       run: |
 #         mkdir -p '${{ env.CCACHE_DIR }}'
-#     - name: Build Gluten velox third party
+#     - name: Build Gluten native libraries
 #       run: |
 #         rm -rf /opt/miniconda-for-velox/
 #         cd ep/build-velox/src && \
@@ -113,7 +150,7 @@ jobs:
 #       working-directory: ${{ github.workspace }}
 #       run: |
 #         mkdir -p '${{ env.CCACHE_DIR }}'
-#     - name: Build Gluten velox third party
+#     - name: Build Gluten native libraries
 #       run: |
 #         rm -rf /opt/miniconda-for-velox/
 #         cd ep/build-velox/src && \
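Both caching jobs above follow the same probe-then-build pattern: `actions/cache/restore` with `lookup-only: true` only checks whether the key exists (no archive download), the build step runs only on a cache miss, and `actions/cache/save` then stores the artifacts under the same key. A minimal sketch of that control flow, written in Scala like the rest of this commit's code (the function names are illustrative, not part of the workflow):

```scala
// Sketch of the restore/build/save flow used by both jobs above.
// `probe`, `build`, and `save` stand in for the GitHub Actions steps.
def ensureNativeLibCache(
    key: String,
    probe: String => Boolean, // actions/cache/restore with lookup-only: true
    build: () => Unit,        // "Build Gluten native libraries"
    save: String => Unit      // actions/cache/save
): Unit = {
  // Both the build and save steps are guarded by the same cache-hit check.
  if (!probe(key)) {
    build()
    save(key)
  }
}
```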
15 changes: 7 additions & 8 deletions README.md

@@ -76,7 +76,7 @@ spark-shell \
   --conf spark.memory.offHeap.enabled=true \
   --conf spark.memory.offHeap.size=20g \
   --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
-  --jars https://github.com/apache/incubator-gluten/releases/download/v1.0.0/gluten-velox-bundle-spark3.2_2.12-ubuntu_20.04_x86_64-1.0.0.jar
+  --jars https://github.com/apache/incubator-gluten/releases/download/v1.1.1/gluten-velox-bundle-spark3.2_2.12-ubuntu_20.04_x86_64-1.1.1.jar
 ```

 # 3.2 Custom Build
@@ -120,8 +120,8 @@ Gluten successfully joined Apache Incubator since March'24. We welcome developer
 https://gluten.apache.org/

 ### Mailing lists
-For any technical discussion, please send email to [[email protected]](mailto:[email protected]). See [archives](https://lists.apache.org/[email protected]).
-Please click [here](mailto:[email protected]) to subscribe.
+For any technical discussion, please send email to [[email protected]](mailto:[email protected]). You can go to [archives](https://lists.apache.org/[email protected])
+for getting historical discussions. Please click [here](mailto:[email protected]) to subscribe the mail list.

 ### Wechat group
 We also have a Wechat group (in Chinese) which may be more friendly for PRC developers/users. Due to the limitation of wechat group, please contact with weitingchen at apache.org or zhangzc at apache.org to be invited to the group.
@@ -156,16 +156,15 @@ Gluten is licensed under [Apache 2.0 license](https://www.apache.org/licenses/LI

 # 7 Contact

-Gluten was initiated by Intel and Kyligence in 2022. Several companies such as Intel, Kyligence, BIGO, Meituan, Alibaba Cloud, NetEase, Baidu, Microsoft and others, are actively participating in the development of Gluten. If you are interested in Gluten project, please contact below email address for further discussion.
+Gluten was initiated by Intel and Kyligence in 2022. Several companies are also actively participating in the development, such as BIGO, Meituan, Alibaba Cloud, NetEase, Baidu, Microsoft, etc. If you are interested in Gluten project, please contact and subscribe below mailing lists for further discussion.

-[email protected]; [email protected];
-[email protected]; [email protected]; [email protected];
-[email protected];[email protected];[email protected];[email protected]
+* For community activity: [email protected]
+* For code repository activity: [email protected]

 # 8 Thanks to our contributors

 <a href="https://github.com/apache/incubator-gluten/graphs/contributors">
-  <img src="https://contrib.rocks/image?repo=apache/incubator-gluten" />
+  <img src="https://contrib.rocks/image?repo=apache/incubator-gluten&columns=25" />
 </a>

 ##### \* LEGAL NOTICE: Your use of this software and any required dependent software (the "Software Package") is subject to the terms and conditions of the software license agreements for the Software Package, which may also include notices, disclaimers, or license terms for third party or open source software included in or with the Software Package, and your use indicates your acceptance of all such terms. Please refer to the "TPP.txt" or other similarly-named text file included with the Software Package for additional details.
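As a quick sanity check for the quick-start command above (illustrative, not part of this commit): once the bundle jar is loaded and the configs are set, offloaded operators should show up in the physical plan.

```scala
// Run inside the spark-shell started with the flags shown above.
// If Gluten is active, the plan contains Gluten/Velox operators such as
// WholeStageTransformer or VeloxColumnarToRowExec instead of plain
// WholeStageCodegen stages.
val df = spark.range(0, 1000000).selectExpr("sum(id) AS total")
df.explain()
df.show()
```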
CHNativeExpressionEvaluator.java

@@ -81,6 +81,11 @@ private Map<String, String> getNativeBackendConf() {
         BackendsApiManager.getSettings().getBackendConfigPrefix(), SQLConf.get().getAllConfs());
   }

+  public static void injectWriteFilesTempPath(String path, String fileName) {
+    throw new UnsupportedOperationException(
+        "injectWriteFilesTempPath Not supported in CHNativeExpressionEvaluator");
+  }
+
   // Used by WholeStageTransform to create the native computing pipeline and
   // return a columnar result iterator.
   public BatchIterator createKernelWithBatchIterator(
CHBackendSettings.scala

@@ -25,13 +25,11 @@ import org.apache.gluten.substrait.rel.LocalFilesNode.ReadFileFormat._

 import org.apache.spark.SparkEnv
 import org.apache.spark.internal.Logging
-import org.apache.spark.sql.catalyst.catalog.BucketSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.AggregateExpression
 import org.apache.spark.sql.catalyst.plans.physical.{HashPartitioning, Partitioning}
 import org.apache.spark.sql.execution.SparkPlan
 import org.apache.spark.sql.execution.aggregate.HashAggregateExec
-import org.apache.spark.sql.execution.datasources.FileFormat
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.types.{ArrayType, MapType, StructField, StructType}

@@ -286,17 +284,12 @@ object CHBackendSettings extends BackendSettingsApi with Logging {
       .getLong(GLUTEN_MAX_SHUFFLE_READ_BYTES, GLUTEN_MAX_SHUFFLE_READ_BYTES_DEFAULT)
   }

-  override def supportWriteFilesExec(
-      format: FileFormat,
-      fields: Array[StructField],
-      bucketSpec: Option[BucketSpec],
-      options: Map[String, String]): ValidationResult =
-    ValidationResult.failed("CH backend is unsupported.")
-
   override def enableNativeWriteFiles(): Boolean = {
     GlutenConfig.getConf.enableNativeWriter.getOrElse(false)
   }

   override def mergeTwoPhasesHashBaseAggregateIfNeed(): Boolean = true

+  override def supportCartesianProductExec(): Boolean = true
+
 }
CHIteratorApi.scala

@@ -122,7 +122,8 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil {
       partition: InputPartition,
       partitionSchema: StructType,
       fileFormat: ReadFileFormat,
-      metadataColumnNames: Seq[String]): SplitInfo = {
+      metadataColumnNames: Seq[String],
+      properties: Map[String, String]): SplitInfo = {
     partition match {
       case p: GlutenMergeTreePartition =>
         val partLists = new JArrayList[String]()
@@ -183,7 +184,8 @@ class CHIteratorApi extends IteratorApi with Logging with LogLevelUtil {
           partitionColumns,
           new JArrayList[JMap[String, String]](),
           fileFormat,
-          preferredLocations.toList.asJava
+          preferredLocations.toList.asJava,
+          mapAsJavaMap(properties)
         )
       case _ =>
         throw new UnsupportedOperationException(s"Unsupported input partition: $partition.")
@@ -209,7 +211,6 @@
     split match {
       case filesNode: LocalFilesNode =>
         setFileSchemaForLocalFiles(filesNode, scans(i))
-        filesNode.setFileReadProperties(mapAsJavaMap(scans(i).getProperties))
         filesNode.getPaths.forEach(f => files += f)
         filesNode.toProtobuf.toByteArray
       case extensionTableNode: ExtensionTableNode =>
@@ -290,6 +291,10 @@
       None,
       createNativeIterator(splitInfoByteArray, wsPlan, materializeInput, inputIterators))
   }
+
+  override def injectWriteFilesTempPath(path: String, fileName: String): Unit = {
+    CHNativeExpressionEvaluator.injectWriteFilesTempPath(path, fileName)
+  }
 }

 class CollectMetricIterator(
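The new `injectWriteFilesTempPath` override above simply forwards to `CHNativeExpressionEvaluator`, which (per the earlier hunk) still rejects the call. A hedged sketch of how a caller would reach it — the `BackendsApiManager` lookup follows Gluten's usual backend-API pattern and is an assumption, not shown in this diff:

```scala
// Sketch only: exercising the new API surface on the ClickHouse backend.
// The path and file name are made-up examples.
val iteratorApi = BackendsApiManager.getIteratorApiInstance
try {
  iteratorApi.injectWriteFilesTempPath("/tmp/gluten-write-stage0", "part-00000")
} catch {
  case e: UnsupportedOperationException =>
    // Expected on the CH backend for now: native writes are not wired up yet.
    println(e.getMessage)
}
```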
CHSparkPlanExecApi.scala

@@ -37,8 +37,6 @@ import org.apache.spark.shuffle.{GenShuffleWriterParameters, GlutenShuffleWriter
 import org.apache.spark.shuffle.utils.CHShuffleUtil
 import org.apache.spark.sql.{SparkSession, Strategy}
 import org.apache.spark.sql.catalyst.{CHAggregateFunctionRewriteRule, EqualToRewrite}
-import org.apache.spark.sql.catalyst.catalog.BucketSpec
-import org.apache.spark.sql.catalyst.catalog.CatalogTypes.TablePartitionSpec
 import org.apache.spark.sql.catalyst.expressions._
 import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, CollectList, CollectSet}
 import org.apache.spark.sql.catalyst.optimizer.BuildSide
@@ -49,7 +47,7 @@ import org.apache.spark.sql.catalyst.rules.Rule
 import org.apache.spark.sql.delta.files.TahoeFileIndex
 import org.apache.spark.sql.execution._
 import org.apache.spark.sql.execution.adaptive.AQEShuffleReadExec
-import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation}
+import org.apache.spark.sql.execution.datasources.{FileFormat, HadoopFsRelation, WriteJobDescription}
 import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
 import org.apache.spark.sql.execution.datasources.v2.clickhouse.source.DeltaMergeTreeFileFormat
 import org.apache.spark.sql.execution.exchange.{BroadcastExchangeExec, ShuffleExchangeExec}
@@ -145,10 +143,10 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
     }

     child match {
-      case scan: FileSourceScanExec if (checkMergeTreeFileFormat(scan.relation)) =>
+      case scan: FileSourceScanExec if checkMergeTreeFileFormat(scan.relation) =>
         // For the validation phase of the AddFallbackTagRule
         CHFilterExecTransformer(condition, child)
-      case scan: FileSourceScanExecTransformerBase if (checkMergeTreeFileFormat(scan.relation)) =>
+      case scan: FileSourceScanExecTransformerBase if checkMergeTreeFileFormat(scan.relation) =>
         // For the transform phase, the FileSourceScanExec is already transformed
         CHFilterExecTransformer(condition, child)
       case _ =>
@@ -364,8 +362,15 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
       left: SparkPlan,
       right: SparkPlan,
       condition: Option[Expression]): CartesianProductExecTransformer =
-    throw new GlutenNotSupportException(
-      "CartesianProductExecTransformer is not supported in ch backend.")
+    if (!condition.isEmpty) {
+      throw new GlutenNotSupportException(
+        "CartesianProductExecTransformer with condition is not supported in ch backend.")
+    } else {
+      CartesianProductExecTransformer(
+        ColumnarCartesianProductBridge(left),
+        ColumnarCartesianProductBridge(right),
+        condition)
+    }

   override def genBroadcastNestedLoopJoinExecTransformer(
       left: SparkPlan,
@@ -395,7 +400,7 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
       left: ExpressionTransformer,
       right: ExpressionTransformer,
       original: GetMapValue): ExpressionTransformer =
-    GetMapValueTransformer(substraitExprName, left, right, false, original)
+    GetMapValueTransformer(substraitExprName, left, right, failOnError = false, original)

   /**
    * Generate ShuffleDependency for ColumnarShuffleExchangeExec.
@@ -669,15 +674,8 @@ class CHSparkPlanExecApi extends SparkPlanExecApi {
     CHRegExpReplaceTransformer(substraitExprName, children, expr)
   }

-  override def createColumnarWriteFilesExec(
-      child: SparkPlan,
-      fileFormat: FileFormat,
-      partitionColumns: Seq[Attribute],
-      bucketSpec: Option[BucketSpec],
-      options: Map[String, String],
-      staticPartitions: TablePartitionSpec): SparkPlan = {
-    throw new GlutenNotSupportException("ColumnarWriteFilesExec is not support in ch backend.")
-  }
+  def createBackendWrite(description: WriteJobDescription): BackendWrite =
+    throw new UnsupportedOperationException("createBackendWrite is not supported in ch backend.")

   override def createColumnarArrowEvalPythonExec(
     udfs: Seq[PythonUDF],
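The `genCartesianProductExecTransformer` change above is the user-visible core of this merge: a cartesian product without a join condition can now be offloaded on the CH backend, while a conditioned one still falls back. A small usage sketch (the session and data are illustrative; `crossJoin` is the standard Spark API that plans a `CartesianProductExec`):

```scala
// Assumes a SparkSession `spark` running with the Gluten ClickHouse backend.
import spark.implicits._

val left = spark.range(0, 4).toDF("l")
val right = spark.range(0, 3).toDF("r")

// No join condition: the CartesianProductExec can be replaced by
// CartesianProductExecTransformer, with both inputs wrapped in
// ColumnarCartesianProductBridge as in the code above.
left.crossJoin(right).show()

// A non-equi condition keeps a condition on the cartesian product node;
// per the code above this still throws GlutenNotSupportException during
// validation, so the query falls back to vanilla Spark.
left.join(right, $"l" > $"r").show()
```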
CHBroadcastNestedLoopJoinExecTransformer.scala

@@ -83,6 +83,9 @@ case class CHBroadcastNestedLoopJoinExecTransformer(
     // for ch
     val joinParametersStr = new StringBuffer("JoinParameters:")
     joinParametersStr
+      .append("isBHJ=")
+      .append(1)
+      .append("\n")
       .append("buildHashTableId=")
       .append(buildBroadcastTableId)
       .append("\n")
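The three new `append` calls prepend an `isBHJ` flag to the serialized join parameters passed to the native engine. A runnable sketch of the string being assembled (`buildBroadcastTableId` is stubbed with a made-up value):

```scala
// Reproduces the string-building above outside the transformer.
val buildBroadcastTableId = "BuiltBHJBroadcastTable-demo" // illustrative value
val joinParametersStr = new StringBuffer("JoinParameters:")
joinParametersStr
  .append("isBHJ=")
  .append(1)
  .append("\n")
  .append("buildHashTableId=")
  .append(buildBroadcastTableId)
  .append("\n")
println(joinParametersStr)
// JoinParameters:isBHJ=1
// buildHashTableId=BuiltBHJBroadcastTable-demo
```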
GlutenClickHouseTPCDSAbstractSuite.scala

@@ -65,16 +65,6 @@ abstract class GlutenClickHouseTPCDSAbstractSuite
         // Q45 BroadcastHashJoin, ExistenceJoin
         // Q94 BroadcastHashJoin, LeftSemi, NOT condition
         (false, false)
-      case j if j == 38 || j == 87 =>
-        // Q38 and Q87 : Hash shuffle is not supported for expression in some case
-        if (isAqe) {
-          (true, true)
-        } else {
-          (false, true)
-        }
-      case q77 if q77 == 77 && !isAqe =>
-        // Q77 CartesianProduct
-        (false, false)
       case other => (true, false)
     }
     sqlNums.map((_, noFallBack._1, noFallBack._2))
GlutenClickHouseTPCDSParquetSortMergeJoinSuite.scala

@@ -29,10 +29,12 @@ class GlutenClickHouseTPCDSParquetSortMergeJoinSuite extends GlutenClickHouseTPC
     "q14b",
     "q23a",
     "q23b",
+    "q38",
     "q51",
     "q69",
     "q70",
     "q78",
+    "q87",
     "q95",
     "q97"
   ) ++ super.excludedTpcdsQueries
(Diffs for the remaining changed files were not loaded on this page.)
