From 8be5b18f924c7965ed8ae5362ac8d121511e4efb Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Wed, 18 Dec 2024 23:07:28 +0800 Subject: [PATCH 1/8] fixup component file --- ...apache.gluten.component.CHIcebergComponent | 0 .../org.apache.gluten.component.Component | 1 - ...he.gluten.backendsapi.clickhouse.CHBackend | 0 .../org.apache.gluten.backend.Backend | 1 - ...che.gluten.component.VeloxIcebergComponent | 0 .../org.apache.gluten.component.Component | 1 - ...ache.gluten.backendsapi.velox.VeloxBackend | 0 .../org.apache.gluten.backend.Backend | 1 - .../org/apache/gluten/utils/ResourceUtil.java | 103 ++++++++++++++++++ .../apache/gluten/component/Discovery.scala | 82 ++++++++++++++ .../org/apache/gluten/component/package.scala | 9 +- .../apache/gluten/integration/BaseMixin.java | 3 + 12 files changed, 189 insertions(+), 12 deletions(-) create mode 100644 backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent delete mode 100644 backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component create mode 100644 backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend delete mode 100644 backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend create mode 100644 backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent delete mode 100644 backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component create mode 100644 backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend delete mode 100644 backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend create mode 100644 gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java create mode 100644 
gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala diff --git a/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent b/backends-clickhouse/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.CHIcebergComponent new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component b/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component deleted file mode 100644 index a13f6fa739e8..000000000000 --- a/backends-clickhouse/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component +++ /dev/null @@ -1 +0,0 @@ -org.apache.gluten.component.CHIcebergComponent diff --git a/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend b/backends-clickhouse/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.clickhouse.CHBackend new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend b/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend deleted file mode 100644 index bcd3cb1c03a0..000000000000 --- a/backends-clickhouse/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend +++ /dev/null @@ -1 +0,0 @@ -org.apache.gluten.backendsapi.clickhouse.CHBackend diff --git a/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent b/backends-velox/src-iceberg/main/resources/META-INF/gluten-components/org.apache.gluten.component.VeloxIcebergComponent new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component 
b/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component deleted file mode 100644 index e9e844c6bb47..000000000000 --- a/backends-velox/src-iceberg/main/resources/META-INF/services/org.apache.gluten.component.Component +++ /dev/null @@ -1 +0,0 @@ -org.apache.gluten.component.VeloxIcebergComponent diff --git a/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend b/backends-velox/src/main/resources/META-INF/gluten-components/org.apache.gluten.backendsapi.velox.VeloxBackend new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend b/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend deleted file mode 100644 index 7cc9b395911f..000000000000 --- a/backends-velox/src/main/resources/META-INF/services/org.apache.gluten.backend.Backend +++ /dev/null @@ -1 +0,0 @@ -org.apache.gluten.backendsapi.velox.VeloxBackend diff --git a/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java new file mode 100644 index 000000000000..88f5fe745c78 --- /dev/null +++ b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java @@ -0,0 +1,103 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.gluten.utils; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Enumeration; +import java.util.List; +import java.util.regex.Pattern; +import java.util.zip.ZipEntry; +import java.util.zip.ZipException; +import java.util.zip.ZipFile; + +/** + * Code is copied from here + * and then modified for Gluten's use. + */ +public class ResourceUtil { + /** + * Get a collection of resource paths by the input RegEx pattern. + * + * @param pattern The pattern to match. + * @return The relative resource paths in the order they are found. 
+ */ + public static List getResources(final Pattern pattern) { + final List buffer = new ArrayList<>(); + final String classPath = System.getProperty("java.class.path"); + final String[] classPathElements = classPath.split(File.pathSeparator); + for (final String element : classPathElements) { + getResources(element, pattern, buffer); + } + return Collections.unmodifiableList(buffer); + } + + private static void getResources( + final String element, final Pattern pattern, final List buffer) { + final File file = new File(element); + if (file.isDirectory()) { + getResourcesFromDirectory(file, file, pattern, buffer); + } else { + getResourcesFromJarFile(file, pattern, buffer); + } + } + + private static void getResourcesFromJarFile( + final File file, final Pattern pattern, final List buffer) { + ZipFile zf; + try { + zf = new ZipFile(file); + } catch (final ZipException e) { + throw new RuntimeException(e); + } catch (final IOException e) { + throw new RuntimeException(e); + } + final Enumeration e = zf.entries(); + while (e.hasMoreElements()) { + final ZipEntry ze = (ZipEntry) e.nextElement(); + final String fileName = ze.getName(); + final boolean accept = pattern.matcher(fileName).matches(); + if (accept) { + buffer.add(fileName); + } + } + try { + zf.close(); + } catch (final IOException e1) { + throw new RuntimeException(e1); + } + } + + private static void getResourcesFromDirectory( + final File root, final File directory, final Pattern pattern, final List buffer) { + final File[] fileList = directory.listFiles(); + for (final File file : fileList) { + if (file.isDirectory()) { + getResourcesFromDirectory(root, file, pattern, buffer); + } else { + final String relative = root.toURI().relativize(file.toURI()).getPath(); + final boolean accept = pattern.matcher(relative).matches(); + if (accept) { + buffer.add(relative); + } + } + } + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala 
b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala new file mode 100644 index 000000000000..f8e6ced48479 --- /dev/null +++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.gluten.component + +import org.apache.gluten.exception.GlutenException +import org.apache.gluten.utils.ResourceUtil + +import org.apache.spark.internal.Logging +import org.apache.spark.util.SparkReflectionUtil + +import scala.collection.JavaConverters._ +import scala.util.matching.Regex + + + + +// format: off +/** + * Gluten's global discovery to find all [[Component]] definitions in the classpath. + * + * We don't use [[java.util.ServiceLoader]] since it requires all the service files have + * the same file name which is the class name of [[Component]], this causes the service files + * easily be overwritten during Maven build. Typically, See code of `DefaultMavenFileFilter` + * used by Maven's `maven-resources-plugin`. + * + * Instead, Gluten defines its own way to register components. 
For example, placing the following + * component files to resource folder: + * + * META-INF + * \- gluten-components + * |- org.apache.gluten.component.AComponent + * \- org.apache.gluten.backend.BBackend + * + * Will cause the registration of component `AComponent` and backend `BBackend`. + * + * The content in a component file is not read so doesn't matter at the moment. + */ +// format: on +private object Discovery extends Logging { + private val container: String = "META-INF/gluten-components" + private val componentFilePattern: Regex = s"^$container/(.+)$$".r + + def discoverAll(): Seq[Component] = { + logInfo("Start discovering components in the current classpath... ") + val prev = System.currentTimeMillis() + val allFiles = ResourceUtil.getResources(componentFilePattern.pattern).asScala + val duration = System.currentTimeMillis() - prev + logInfo(s"Discovered component files: ${allFiles.mkString(",")}. Duration: $duration ms.") + val out = allFiles + .flatMap { + case componentFilePattern(className) => + val clazz = + try { + SparkReflectionUtil.classForName(className) + } catch { + case e: ClassNotFoundException => + throw new GlutenException(s"Component class not found: $className", e) + } + val instance = clazz.getDeclaredConstructor().newInstance().asInstanceOf[Component] + Some(instance) + case _ => None + } + .distinct + .toSeq + out + } +} diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala index f74b96729418..032a32d04121 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/component/package.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/component/package.scala @@ -16,15 +16,10 @@ */ package org.apache.gluten -import org.apache.gluten.backend.Backend - import org.apache.spark.internal.Logging -import java.util.ServiceLoader import java.util.concurrent.atomic.AtomicBoolean -import scala.collection.JavaConverters._ - package object 
component extends Logging { private val allComponentsLoaded: AtomicBoolean = new AtomicBoolean(false) @@ -34,9 +29,7 @@ package object component extends Logging { } // Load all components in classpath. - val discoveredBackends = ServiceLoader.load(classOf[Backend]).asScala - val discoveredComponents = ServiceLoader.load(classOf[Component]).asScala - val all = discoveredBackends ++ discoveredComponents + val all = Discovery.discoverAll() // Register all components. all.foreach(_.ensureRegistered()) diff --git a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java index 08c55d78a67a..b369fffd740c 100644 --- a/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java +++ b/tools/gluten-it/common/src/main/java/org/apache/gluten/integration/BaseMixin.java @@ -22,6 +22,7 @@ import org.apache.gluten.integration.ds.TpcdsSuite; import org.apache.gluten.integration.h.TpchSuite; import org.apache.log4j.Level; +import org.apache.log4j.LogManager; import org.apache.spark.SparkConf; import picocli.CommandLine; import scala.Predef; @@ -120,6 +121,8 @@ public Integer runActions(Action[] actions) { throw new IllegalArgumentException("Log level not found: " + logLevel); } + LogManager.getRootLogger().setLevel(level); + scala.collection.immutable.Map extraSparkConfScala = JavaConverters.mapAsScalaMapConverter( mergeMapSafe(extraSparkConf, runModeEnumeration.extraSparkConf())).asScala().toMap( From 29f84ef7cb5648c5debf701bb1fafd9315aa7922 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Wed, 18 Dec 2024 23:38:37 +0800 Subject: [PATCH 2/8] fixup --- .../src/main/scala/org/apache/gluten/component/Discovery.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala index f8e6ced48479..9f31d8c0485c 
100644 --- a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala @@ -60,7 +60,7 @@ private object Discovery extends Logging { val prev = System.currentTimeMillis() val allFiles = ResourceUtil.getResources(componentFilePattern.pattern).asScala val duration = System.currentTimeMillis() - prev - logInfo(s"Discovered component files: ${allFiles.mkString(",")}. Duration: $duration ms.") + logInfo(s"Discovered component files: ${allFiles.mkString(", ")}. Duration: $duration ms.") val out = allFiles .flatMap { case componentFilePattern(className) => From 570256a2496bd6b50706a48be3ddfa584dcc9f96 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Wed, 18 Dec 2024 23:39:32 +0800 Subject: [PATCH 3/8] fixup --- .../src/main/scala/org/apache/gluten/component/Discovery.scala | 2 -- 1 file changed, 2 deletions(-) diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala index 9f31d8c0485c..43ffcf94f89a 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala @@ -27,8 +27,6 @@ import scala.collection.JavaConverters._ import scala.util.matching.Regex - - // format: off /** * Gluten's global discovery to find all [[Component]] definitions in the classpath. 
From f4415eb880bdbd1bd82d92de4b0ab05d7e217f15 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Wed, 18 Dec 2024 23:42:20 +0800 Subject: [PATCH 4/8] fixup --- .../main/scala/org/apache/gluten/component/Discovery.scala | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala index 43ffcf94f89a..253473d09b06 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala @@ -31,10 +31,10 @@ import scala.util.matching.Regex /** * Gluten's global discovery to find all [[Component]] definitions in the classpath. * - * We don't use [[java.util.ServiceLoader]] since it requires all the service files have + * We don't use [[java.util.ServiceLoader]] since it requires all the service files to have * the same file name which is the class name of [[Component]], this causes the service files - * easily be overwritten during Maven build. Typically, See code of `DefaultMavenFileFilter` - * used by Maven's `maven-resources-plugin`. + * to easily be overwritten by each other during a Maven build. For example, see the code of + * `DefaultMavenFileFilter` used by Maven's `maven-resources-plugin`. * * Instead, Gluten defines its own way to register components. 
For example, placing the following * component files to resource folder: From e2c08fcfab5eb1296dd9699a1c9a94af72c7ea6f Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 19 Dec 2024 09:04:21 +0800 Subject: [PATCH 5/8] fixup --- .../apache/gluten/component/Discovery.scala | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala index 253473d09b06..2b8f060a69f7 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/component/Discovery.scala @@ -24,9 +24,12 @@ import org.apache.spark.internal.Logging import org.apache.spark.util.SparkReflectionUtil import scala.collection.JavaConverters._ +import scala.collection.mutable import scala.util.matching.Regex + + // format: off /** * Gluten's global discovery to find all [[Component]] definitions in the classpath. @@ -59,9 +62,13 @@ private object Discovery extends Logging { val allFiles = ResourceUtil.getResources(componentFilePattern.pattern).asScala val duration = System.currentTimeMillis() - prev logInfo(s"Discovered component files: ${allFiles.mkString(", ")}. 
Duration: $duration ms.") - val out = allFiles - .flatMap { - case componentFilePattern(className) => + val deDup = mutable.Set[String]() + val out = allFiles.flatMap { + case componentFilePattern(className) => + if (!deDup.add(className)) { + logWarning(s"Found duplicated component class $className in then classpath, ignoring.") + None + } else { val clazz = try { SparkReflectionUtil.classForName(className) @@ -71,10 +78,9 @@ private object Discovery extends Logging { } val instance = clazz.getDeclaredConstructor().newInstance().asInstanceOf[Component] Some(instance) - case _ => None - } - .distinct - .toSeq + } + case _ => None + }.toSeq out } } From f405fcc7835ae1bb8304b5dbe22c41139554f3b2 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 19 Dec 2024 10:03:07 +0800 Subject: [PATCH 6/8] fixup --- .../gluten/backendsapi/BackendsApiManager.scala | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala index 3b4e97afb361..63ca4417679f 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala @@ -21,17 +21,17 @@ import org.apache.gluten.component.Component object BackendsApiManager { private lazy val backend: SubstraitBackend = initializeInternal() - /** Initialize all backends api. */ + /** Initialize all backends apis. 
*/ private def initializeInternal(): SubstraitBackend = { val loadedSubstraitBackends = Component.sorted().filter(_.isInstanceOf[SubstraitBackend]) - assert(loadedSubstraitBackends.size == 1, "More than one Substrait backends are loaded") + assert( + loadedSubstraitBackends.size == 1, + s"More than one Substrait backends are loaded: " + + s"${loadedSubstraitBackends.map(_.name()).mkString(", ")}") loadedSubstraitBackends.head.asInstanceOf[SubstraitBackend] } - /** - * Automatically detect the backend api. - * @return - */ + /** Automatically detect the backend api. */ def initialize(): String = { getBackendName } From 183c02343e0cb816b583900886ce2149a7cf2abb Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 19 Dec 2024 10:47:18 +0800 Subject: [PATCH 7/8] fixup --- .../org/apache/gluten/backendsapi/BackendsApiManager.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala index 63ca4417679f..4b6f674905af 100644 --- a/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala +++ b/gluten-substrait/src/main/scala/org/apache/gluten/backendsapi/BackendsApiManager.scala @@ -26,7 +26,7 @@ object BackendsApiManager { val loadedSubstraitBackends = Component.sorted().filter(_.isInstanceOf[SubstraitBackend]) assert( loadedSubstraitBackends.size == 1, - s"More than one Substrait backends are loaded: " + + s"Zero or more than one Substrait backends are loaded: " + s"${loadedSubstraitBackends.map(_.name()).mkString(", ")}") loadedSubstraitBackends.head.asInstanceOf[SubstraitBackend] } From 18b594f9124f382b61fec09d3cb82430ce5bf932 Mon Sep 17 00:00:00 2001 From: Hongze Zhang Date: Thu, 19 Dec 2024 11:10:12 +0800 Subject: [PATCH 8/8] fixup --- .../java/org/apache/gluten/utils/ResourceUtil.java | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git 
a/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java index 88f5fe745c78..692a91af2667 100644 --- a/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java +++ b/gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java @@ -16,6 +16,9 @@ */ package org.apache.gluten.utils; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.io.File; import java.io.IOException; import java.util.ArrayList; @@ -33,6 +36,9 @@ * and then modified for Gluten's use. */ public class ResourceUtil { + + private static final Logger LOG = LoggerFactory.getLogger(ResourceUtil.class); + /** * Get a collection of resource paths by the input RegEx pattern. * @@ -52,6 +58,10 @@ public static List getResources(final Pattern pattern) { private static void getResources( final String element, final Pattern pattern, final List buffer) { final File file = new File(element); + if (!file.exists()) { + LOG.info("Skip non-existing classpath: {}", element); + return; + } if (file.isDirectory()) { getResourcesFromDirectory(file, file, pattern, buffer); } else {