Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CORE] Use component file to discover components #8271

Merged
merged 8 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view

This file was deleted.

This file was deleted.

This file was deleted.

This file was deleted.

113 changes: 113 additions & 0 deletions gluten-core/src/main/java/org/apache/gluten/utils/ResourceUtil.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.gluten.utils;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Enumeration;
import java.util.List;
import java.util.regex.Pattern;
import java.util.zip.ZipEntry;
import java.util.zip.ZipException;
import java.util.zip.ZipFile;

/**
 * Utility for listing resources on the JVM classpath whose relative path matches a regular
 * expression.
 *
 * <p>Code is copied from <a
 * href="https://stackoverflow.com/questions/3923129/get-a-list-of-resources-from-classpath-directory">here</a>
 * and then modified for Gluten's use.
 */
public class ResourceUtil {

  private static final Logger LOG = LoggerFactory.getLogger(ResourceUtil.class);

  /**
   * Get a collection of resource paths by the input RegEx pattern.
   *
   * <p>Every element of the {@code java.class.path} system property is visited in order.
   * Directories are walked recursively; any other existing file is opened as a zip/jar archive.
   * Elements that do not exist on disk are skipped.
   *
   * @param pattern The pattern to match. The whole relative path has to match.
   * @return The relative resource paths in the order they are found, as an unmodifiable list.
   * @throws RuntimeException if an existing, non-directory classpath element cannot be read as
   *     a zip/jar archive.
   */
  public static List<String> getResources(final Pattern pattern) {
    final List<String> buffer = new ArrayList<>();
    final String classPath = System.getProperty("java.class.path");
    final String[] classPathElements = classPath.split(File.pathSeparator);
    for (final String element : classPathElements) {
      getResources(element, pattern, buffer);
    }
    return Collections.unmodifiableList(buffer);
  }

  /** Dispatches one classpath element to the directory walker or the archive reader. */
  private static void getResources(
      final String element, final Pattern pattern, final List<String> buffer) {
    final File file = new File(element);
    if (!file.exists()) {
      LOG.info("Skip non-existing classpath: {}", element);
      return;
    }
    if (file.isDirectory()) {
      getResourcesFromDirectory(file, file, pattern, buffer);
    } else {
      getResourcesFromJarFile(file, pattern, buffer);
    }
  }

  /** Appends the names of all entries in the given zip/jar archive that match the pattern. */
  private static void getResourcesFromJarFile(
      final File file, final Pattern pattern, final List<String> buffer) {
    // try-with-resources guarantees the archive is closed even if iteration throws.
    try (ZipFile zf = new ZipFile(file)) {
      final Enumeration<? extends ZipEntry> entries = zf.entries();
      while (entries.hasMoreElements()) {
        final String fileName = entries.nextElement().getName();
        if (pattern.matcher(fileName).matches()) {
          buffer.add(fileName);
        }
      }
    } catch (final IOException e) {
      // ZipException is a subclass of IOException, so a single handler covers both.
      throw new RuntimeException("Failed to read classpath element: " + file, e);
    }
  }

  /**
   * Recursively appends the paths (relative to {@code root}) of all files under {@code
   * directory} that match the pattern.
   */
  private static void getResourcesFromDirectory(
      final File root, final File directory, final Pattern pattern, final List<String> buffer) {
    final File[] fileList = directory.listFiles();
    if (fileList == null) {
      // File#listFiles returns null on I/O error or when the path is not a readable directory.
      LOG.warn("Skip unreadable classpath directory: {}", directory);
      return;
    }
    for (final File file : fileList) {
      if (file.isDirectory()) {
        getResourcesFromDirectory(root, file, pattern, buffer);
      } else {
        // Relativizing through URIs yields '/'-separated paths, like zip entry names.
        final String relative = root.toURI().relativize(file.toURI()).getPath();
        if (pattern.matcher(relative).matches()) {
          buffer.add(relative);
        }
      }
    }
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,86 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.gluten.component

import org.apache.gluten.exception.GlutenException
import org.apache.gluten.utils.ResourceUtil

import org.apache.spark.internal.Logging
import org.apache.spark.util.SparkReflectionUtil

import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.util.matching.Regex




// format: off
/**
* Gluten's global discovery to find all [[Component]] definitions in the classpath.
*
* We don't use [[java.util.ServiceLoader]] since it requires all the service files to have
* the same file name, which is the class name of [[Component]]. This causes the service files
* to be easily overwritten by each other during the Maven build. For example, see the code of
* `DefaultMavenFileFilter` used by Maven's `maven-resources-plugin`.
*
* Instead, Gluten defines its own way to register components. For example, placing the following
* component files in the resource folder:
*
* META-INF
* \- gluten-components
* |- org.apache.gluten.component.AComponent
* \- org.apache.gluten.backend.BBackend
*
* Will cause the registration of component `AComponent` and backend `BBackend`.
*
* The content in a component file is not read so doesn't matter at the moment.
*/
// format: on
private object Discovery extends Logging {
  // Classpath-relative folder that holds the component files.
  private val container: String = "META-INF/gluten-components"
  // Matches any resource path under the container; the single capture group (everything after
  // "<container>/") is used as the fully-qualified component class name.
  private val componentFilePattern: Regex = s"^$container/(.+)$$".r

  /**
   * Scans the classpath for component files and instantiates one [[Component]] per distinct
   * class name found.
   *
   * A class name that shows up more than once is instantiated only for its first occurrence;
   * later occurrences are logged as warnings and ignored. Each class is loaded through
   * `SparkReflectionUtil.classForName` and created through its no-argument constructor.
   *
   * @return
   *   the instantiated components, in the order their files were found.
   * @throws GlutenException
   *   if a discovered class name cannot be loaded.
   */
  def discoverAll(): Seq[Component] = {
    logInfo("Start discovering components in the current classpath... ")
    val prev = System.currentTimeMillis()
    val allFiles = ResourceUtil.getResources(componentFilePattern.pattern).asScala
    val duration = System.currentTimeMillis() - prev
    logInfo(s"Discovered component files: ${allFiles.mkString(", ")}. Duration: $duration ms.")
    // Class names seen so far; Set#add returns false when the name was already present.
    val deDup = mutable.Set[String]()
    val out = allFiles.flatMap {
      case componentFilePattern(className) =>
        if (!deDup.add(className)) {
          logWarning(s"Found duplicated component class $className in the classpath, ignoring.")
          None
        } else {
          val clazz =
            try {
              SparkReflectionUtil.classForName(className)
            } catch {
              case e: ClassNotFoundException =>
                throw new GlutenException(s"Component class not found: $className", e)
            }
          val instance = clazz.getDeclaredConstructor().newInstance().asInstanceOf[Component]
          Some(instance)
        }
      case _ => None
    }.toSeq
    out
  }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,10 @@
*/
package org.apache.gluten

import org.apache.gluten.backend.Backend

import org.apache.spark.internal.Logging

import java.util.ServiceLoader
import java.util.concurrent.atomic.AtomicBoolean

import scala.collection.JavaConverters._

package object component extends Logging {
private val allComponentsLoaded: AtomicBoolean = new AtomicBoolean(false)

Expand All @@ -34,9 +29,7 @@ package object component extends Logging {
}

// Load all components in classpath.
val discoveredBackends = ServiceLoader.load(classOf[Backend]).asScala
val discoveredComponents = ServiceLoader.load(classOf[Component]).asScala
val all = discoveredBackends ++ discoveredComponents
val all = Discovery.discoverAll()

// Register all components.
all.foreach(_.ensureRegistered())
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,17 @@ import org.apache.gluten.component.Component
object BackendsApiManager {
private lazy val backend: SubstraitBackend = initializeInternal()

/** Initialize all backends api. */
/** Initialize all backends apis. */
private def initializeInternal(): SubstraitBackend = {
val loadedSubstraitBackends = Component.sorted().filter(_.isInstanceOf[SubstraitBackend])
assert(loadedSubstraitBackends.size == 1, "More than one Substrait backends are loaded")
assert(
loadedSubstraitBackends.size == 1,
s"Zero or more than one Substrait backends are loaded: " +
s"${loadedSubstraitBackends.map(_.name()).mkString(", ")}")
loadedSubstraitBackends.head.asInstanceOf[SubstraitBackend]
}

/**
* Automatically detect the backend api.
* @return
*/
/** Automatically detect the backend api. */
def initialize(): String = {
getBackendName
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
import org.apache.gluten.integration.ds.TpcdsSuite;
import org.apache.gluten.integration.h.TpchSuite;
import org.apache.log4j.Level;
import org.apache.log4j.LogManager;
import org.apache.spark.SparkConf;
import picocli.CommandLine;
import scala.Predef;
Expand Down Expand Up @@ -120,6 +121,8 @@ public Integer runActions(Action[] actions) {
throw new IllegalArgumentException("Log level not found: " + logLevel);
}

LogManager.getRootLogger().setLevel(level);

scala.collection.immutable.Map<String, String> extraSparkConfScala =
JavaConverters.mapAsScalaMapConverter(
mergeMapSafe(extraSparkConf, runModeEnumeration.extraSparkConf())).asScala().toMap(
Expand Down
Loading