From 2948b2128436222029d14b72a851e5e55d5eb9c2 Mon Sep 17 00:00:00 2001 From: Wenzheng Liu Date: Thu, 7 Dec 2023 12:40:40 +0800 Subject: [PATCH] [GLUTEN-3944][CH]Fix gluten.jar with delta20 when use spark 3.3 (#3947) --- backends-clickhouse/pom.xml | 9 ++-- .../gluten-source-exclude-sparkshims.xml | 26 ---------- ep/build-clickhouse/src/package.sh | 51 ++++++++++--------- .../src/resources/bin/gluten.sh | 14 +++-- 4 files changed, 36 insertions(+), 64 deletions(-) delete mode 100644 backends-clickhouse/src/main/resources/gluten-source-exclude-sparkshims.xml diff --git a/backends-clickhouse/pom.xml b/backends-clickhouse/pom.xml index dd3a495d6bf1..fd340f98ac9b 100644 --- a/backends-clickhouse/pom.xml +++ b/backends-clickhouse/pom.xml @@ -207,11 +207,10 @@ maven-assembly-plugin 3.3.0 - ${jar.assembly.name.prefix}-${project.version}-spark-${sparkbundle.version} - - - src/main/resources/gluten-source-exclude-sparkshims.xml - + ${jar.assembly.name.prefix}-${project.version}-spark-${sparkbundle.version} + + jar-with-dependencies + diff --git a/backends-clickhouse/src/main/resources/gluten-source-exclude-sparkshims.xml b/backends-clickhouse/src/main/resources/gluten-source-exclude-sparkshims.xml deleted file mode 100644 index 05f1356b546a..000000000000 --- a/backends-clickhouse/src/main/resources/gluten-source-exclude-sparkshims.xml +++ /dev/null @@ -1,26 +0,0 @@ - - - - jar-with-dependencies-exclude-sparkshims - - jar - - false - - - / - true - runtime - - io.glutenproject:${sparkshim.artifactId} - - - - - - / - ${project.build.outputDirectory} - - - \ No newline at end of file diff --git a/ep/build-clickhouse/src/package.sh b/ep/build-clickhouse/src/package.sh index 8577d2c8f5a0..0049a5f14439 100755 --- a/ep/build-clickhouse/src/package.sh +++ b/ep/build-clickhouse/src/package.sh @@ -60,9 +60,8 @@ mkdir -p "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/bin mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/conf mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars -mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/extraJars -mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/extraJars/spark32 -mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/extraJars/spark33 +mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars/spark32 +mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/jars/spark33 mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/libs mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/logs @@ -78,37 +77,39 @@ mkdir "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/logs cp "${GLUTEN_SOURCE}"/LICENSE "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" cp "${GLUTEN_SOURCE}"/README.md "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}" -# build gluten jar -cd "${GLUTEN_SOURCE}" -mvn clean package -Pbackends-clickhouse -Pspark-3.2 -Prss -DskipTests -Dcheckstyle.skip -mvn clean package -Pspark-3.3 -am -pl shims/spark33 -DskipTests -Dcheckstyle.skip +# build gluten with spark32 +mvn clean install -Pbackends-clickhouse -Pspark-3.2 -Prss -DskipTests -Dcheckstyle.skip +cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-spark-3.2-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark32/gluten.jar +cp "${GLUTEN_SOURCE}"/gluten-celeborn/clickhouse/target/gluten-celeborn-clickhouse-${PROJECT_VERSION}-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark32 +delta_version_32=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-3.2 --non-recursive exec:exec) +wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/${delta_version_32}/delta-core_2.12-${delta_version_32}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark32 +wget https://repo1.maven.org/maven2/io/delta/delta-storage/${delta_version_32}/delta-storage-${delta_version_32}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark32 + +# build gluten with spark33 +mvn clean install -Pbackends-clickhouse -Pspark-3.3 -Prss -DskipTests -Dcheckstyle.skip +cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-spark-3.3-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark33/gluten.jar +cp "${GLUTEN_SOURCE}"/gluten-celeborn/clickhouse/target/gluten-celeborn-clickhouse-${PROJECT_VERSION}-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars/spark33 +delta_version_33=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-3.3 --non-recursive exec:exec) +wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/${delta_version_33}/delta-core_2.12-${delta_version_33}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark33 +wget https://repo1.maven.org/maven2/io/delta/delta-storage/${delta_version_33}/delta-storage-${delta_version_33}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark33 + +# download common 3rd party jars +protobuf_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${protobuf.version}' --non-recursive exec:exec) +wget https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/${protobuf_version}/protobuf-java-${protobuf_version}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark32 +cp "${PACKAGE_DIR_PATH}"/jars/spark32/protobuf-java-${protobuf_version}.jar "${PACKAGE_DIR_PATH}"/jars/spark33 + +celeborn_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${celeborn.version}' --non-recursive exec:exec) +wget https://repo1.maven.org/maven2/org/apache/celeborn/celeborn-client-spark-3-shaded_2.12/${celeborn_version}/celeborn-client-spark-3-shaded_2.12-${celeborn_version}.jar -P "${PACKAGE_DIR_PATH}"/jars/spark32 +cp "${PACKAGE_DIR_PATH}"/jars/spark32/celeborn-client-spark-3-shaded_2.12-${celeborn_version}.jar "${PACKAGE_DIR_PATH}"/jars/spark33 # build libch.so bash "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/build_clickhouse.sh - -# copy gluten jar and libch.so -cp "${GLUTEN_SOURCE}"/backends-clickhouse/target/gluten-*-jar-with-dependencies-exclude-sparkshims.jar "${PACKAGE_DIR_PATH}"/jars/gluten.jar -cp "${GLUTEN_SOURCE}"/gluten-celeborn/clickhouse/target/gluten-celeborn-clickhouse-${PROJECT_VERSION}-jar-with-dependencies.jar "${PACKAGE_DIR_PATH}"/jars cp "$GLUTEN_SOURCE"/cpp-ch/build/utils/extern-local-engine/libch.so "${PACKAGE_DIR_PATH}"/libs/libch.so -cp "${GLUTEN_SOURCE}"/shims/spark32/target/spark-*-${PROJECT_VERSION}.jar "${PACKAGE_DIR_PATH}"/extraJars/spark32/gluten-spark32-shims.jar -cp "${GLUTEN_SOURCE}"/shims/spark33/target/spark-*-${PROJECT_VERSION}.jar "${PACKAGE_DIR_PATH}"/extraJars/spark33/gluten-spark33-shims.jar # copy bin and conf cp "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/resources/bin/* "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/bin cp "${GLUTEN_SOURCE}"/ep/build-clickhouse/src/resources/conf/* "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/conf -# download 3rd party jars -protobuf_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${protobuf.version}' --non-recursive exec:exec) -wget https://repo1.maven.org/maven2/com/google/protobuf/protobuf-java/${protobuf_version}/protobuf-java-${protobuf_version}.jar -P "${PACKAGE_DIR_PATH}"/jars -celeborn_version=$(mvn -q -Dexec.executable="echo" -Dexec.args='${celeborn.version}' --non-recursive exec:exec) -wget https://repo1.maven.org/maven2/org/apache/celeborn/celeborn-client-spark-3-shaded_2.12/${celeborn_version}/celeborn-client-spark-3-shaded_2.12-${celeborn_version}.jar -P "${PACKAGE_DIR_PATH}"/jars -delta_version_32=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-3.2 --non-recursive exec:exec) -wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/${delta_version_32}/delta-core_2.12-${delta_version_32}.jar -P "${PACKAGE_DIR_PATH}"/extraJars/spark32 -wget https://repo1.maven.org/maven2/io/delta/delta-storage/${delta_version_32}/delta-storage-${delta_version_32}.jar -P "${PACKAGE_DIR_PATH}"/extraJars/spark32 -delta_version_33=$(mvn -q -Dexec.executable="echo" -Dexec.args='${delta.version}' -Pspark-3.3 --non-recursive exec:exec) -wget https://repo1.maven.org/maven2/io/delta/delta-core_2.12/${delta_version_33}/delta-core_2.12-${delta_version_33}.jar -P "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/extraJars/spark33 -wget https://repo1.maven.org/maven2/io/delta/delta-storage/${delta_version_33}/delta-storage-${delta_version_33}.jar -P "${GLUTEN_SOURCE}"/dist/"${PACKAGE_NAME}"/extraJars/spark33 - # build tar.gz cd "${GLUTEN_SOURCE}"/dist tar -czf "${PACKAGE_NAME}".tar.gz "${PACKAGE_NAME}" diff --git a/ep/build-clickhouse/src/resources/bin/gluten.sh b/ep/build-clickhouse/src/resources/bin/gluten.sh index f30cab40a894..f45d21b69306 100755 --- a/ep/build-clickhouse/src/resources/bin/gluten.sh +++ b/ep/build-clickhouse/src/resources/bin/gluten.sh @@ -35,18 +35,16 @@ function start() { DRIVER_OPTIONS=${DRIVER_OPTIONS:-"-Dlog4j.configuration=file:${GLUTEN_HOME}/conf/log4j.properties"} DRIVER_OPTIONS="${DRIVER_OPTIONS} $(cat ${GLUTEN_HOME}/conf/gluten.properties | grep "^spark.driver.extraJavaOptions" | cut -d "=" -f 2)" - GLUTEN_JARS=${GLUTEN_HOME}/jars/* - echo "GLUTEN_JARS: ${GLUTEN_JARS} will be loaded." - + GLUTEN_JARS= if [ "${SPARK_MAJOR_MINOR_VERSION}" == "3.2" ]; then - EXTRA_JARS=${GLUTEN_HOME}/extraJars/spark33/* + GLUTEN_JARS=${GLUTEN_HOME}/jars/spark32/* elif [ "${SPARK_MAJOR_MINOR_VERSION}" == "3.3" ]; then - EXTRA_JARS=${GLUTEN_HOME}/extraJars/spark33/* + GLUTEN_JARS=${GLUTEN_HOME}/jars/spark33/* else echo "Unsupported spark version: ${SPARK_MAJOR_MINOR_VERSION}" exit 1 fi - echo "EXTRA_JARS: ${EXTRA_JARS} will be loaded." + echo "GLUTEN_JARS: ${GLUTEN_JARS} will be loaded." export LD_PRELOAD=${GLUTEN_HOME}/libs/libch.so export SPARK_LOG_DIR=${GLUTEN_HOME}/logs @@ -54,8 +52,8 @@ function start() { rm -f ${GLUTEN_HOME}/logs/spark-*.out* nohup ${SPARK_HOME}/sbin/start-thriftserver.sh \ --properties-file ${GLUTEN_HOME}/conf/spark-default.conf \ - --conf spark.driver.extraClassPath=${GLUTEN_JARS}:${EXTRA_JARS} \ - --conf spark.executor.extraClassPath=${GLUTEN_JARS}:${EXTRA_JARS} \ + --conf spark.driver.extraClassPath=${GLUTEN_JARS} \ + --conf spark.executor.extraClassPath=${GLUTEN_JARS} \ --conf spark.driver.extraJavaOptions=${DRIVER_OPTIONS} \ --conf spark.gluten.sql.columnar.libpath=${GLUTEN_HOME}/libs/libch.so \ --verbose \