From 951ac483d6460367a72eccb70b3f663f7b344108 Mon Sep 17 00:00:00 2001
From: Lokesh Jain
Date: Tue, 17 Dec 2024 23:16:50 +0530
Subject: [PATCH] [HUDI-8379] Expression Index should be used with the partition pruning rule

---
 .../org/apache/hudi/HoodieFileIndex.scala     | 158 ++++++++++--------
 1 file changed, 87 insertions(+), 71 deletions(-)

diff --git a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
index 3d1aea3ffa341..c6bf75f4d2513 100644
--- a/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
+++ b/hudi-spark-datasource/hudi-spark-common/src/main/scala/org/apache/hudi/HoodieFileIndex.scala
@@ -30,7 +30,6 @@ import org.apache.hudi.exception.HoodieException
 import org.apache.hudi.keygen.{TimestampBasedAvroKeyGenerator, TimestampBasedKeyGenerator}
 import org.apache.hudi.storage.{StoragePath, StoragePathInfo}
 import org.apache.hudi.util.JFunction
-
 import org.apache.hadoop.fs.{FileStatus, Path}
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.SparkSession
@@ -45,8 +44,8 @@ import org.apache.spark.unsafe.types.UTF8String
 import java.text.SimpleDateFormat
 import java.util.stream.Collectors
 import javax.annotation.concurrent.NotThreadSafe
-
 import scala.collection.JavaConverters._
+import scala.util.control.Breaks.{break, breakable}
 import scala.util.control.NonFatal
 import scala.util.{Failure, Success, Try}
 
@@ -226,8 +225,13 @@ case class HoodieFileIndex(spark: SparkSession,
   def filterFileSlices(dataFilters: Seq[Expression], partitionFilters: Seq[Expression], isPartitionPruned: Boolean = false)
   : Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])] = {
-    val (isPruned, prunedPartitionsAndFileSlices) =
+    var (isPruned, prunedPartitionsAndFileSlices) =
       prunePartitionsAndGetFileSlices(dataFilters, partitionFilters)
+    if (!isPruned && partitionFilters.nonEmpty && isExpressionIndexEnabled) {
+      // val expressionIndexSupport = getExpressionIndexSupport()
+      // lazy val queryReferencedColumns = collectReferencedColumns(spark, partitionFilters, schema)
+      prunedPartitionsAndFileSlices = filterFileSlices(partitionFilters, false, prunedPartitionsAndFileSlices)
+    }
     hasPushedDownPartitionPredicates = true
 
     // If there are no data filters, return all the file slices.
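A note on the intent of the hunk above: when `prunePartitionsAndGetFileSlices` reports that nothing was pruned (`isPruned` is false) but partition filters exist and the expression index is enabled, the new branch re-invokes `filterFileSlices` with the partition filters so the expression index can still narrow the file slices. Below is a minimal, self-contained sketch of that fallback shape only; the names `Slice`, `pruneByPartitionPath` and `pruneByExpressionIndex` are hypothetical stand-ins, not Hudi APIs.

```scala
// Hypothetical stand-ins for Hudi's partition/file-slice types and pruning steps.
case class Slice(partition: String, files: Seq[String])

object FallbackPruningSketch {
  // Plain path-based pruning; returns (didPrune, remainingSlices).
  def pruneByPartitionPath(slices: Seq[Slice], wantedPartitions: Set[String]): (Boolean, Seq[Slice]) =
    if (wantedPartitions.isEmpty) (false, slices)
    else (true, slices.filter(s => wantedPartitions.contains(s.partition)))

  // Index-backed pruning, used here only when path-based pruning was a no-op.
  def pruneByExpressionIndex(slices: Seq[Slice], keep: Slice => Boolean): Seq[Slice] =
    slices.filter(keep)

  def main(args: Array[String]): Unit = {
    val slices = Seq(Slice("2024/01", Seq("a.parquet")), Slice("2024/02", Seq("b.parquet")))
    // No literal partition values could be derived from the filters, so path pruning does nothing...
    var (isPruned, pruned) = pruneByPartitionPath(slices, Set.empty)
    // ...and the fallback applies the (hypothetical) expression-index predicate instead.
    if (!isPruned) {
      pruned = pruneByExpressionIndex(pruned, _.partition.endsWith("02"))
    }
    println(pruned) // List(Slice(2024/02,List(b.parquet)))
  }
}
```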
@@ -236,64 +240,59 @@ case class HoodieFileIndex(spark: SparkSession,
     if (prunedPartitionsAndFileSlices.isEmpty || dataFilters.isEmpty || isPartitionPruned ) {
       prunedPartitionsAndFileSlices
     } else {
-      // Look up candidate files names in the col-stats or record level index, if all of the following conditions are true
-      //    - Data-skipping is enabled
-      //    - Col-Stats Index is present
-      //    - Record-level Index is present
-      //    - List of predicates (filters) is present
-      val candidateFilesNamesOpt: Option[Set[String]] =
-        lookupCandidateFilesInMetadataTable(dataFilters, prunedPartitionsAndFileSlices, isPruned) match {
-          case Success(opt) => opt
-          case Failure(e) =>
-            logError("Failed to lookup candidate files in File Index", e)
-
-            spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
-              case DataSkippingFailureMode.Fallback.value => Option.empty
-              case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
-            }
-        }
+      filterFileSlices(dataFilters, isPruned, prunedPartitionsAndFileSlices)
+    }
+  }
+
+  private def filterFileSlices(dataFilters: Seq[Expression], isPruned: Boolean, prunedPartitionsAndFileSlices: Seq[(Option[PartitionPath], Seq[FileSlice])]) = {
+    // Look up candidate file names in the col-stats or record level index, if all of the following conditions are true
+    //    - Data-skipping is enabled
+    //    - Col-Stats Index is present
+    //    - Record-level Index is present
+    //    - List of predicates (filters) is present
+    val candidateFilesNamesOpt: Option[Set[String]] =
+      lookupCandidateFilesInMetadataTable(dataFilters, prunedPartitionsAndFileSlices, isPruned)
 
-      logDebug(s"Overlapping candidate files from Column Stats or Record Level Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
+    logDebug(s"Overlapping candidate files from Column Stats or Record Level Index: ${candidateFilesNamesOpt.getOrElse(Set.empty)}")
 
-      var totalFileSliceSize = 0
-      var candidateFileSliceSize = 0
+    var totalFileSliceSize = 0
+    var candidateFileSliceSize = 0
 
-      val prunedPartitionsAndFilteredFileSlices = prunedPartitionsAndFileSlices.map {
-        case (partitionOpt, fileSlices) =>
-          // Filter in candidate files based on the col-stats or record level index lookup
-          val candidateFileSlices: Seq[FileSlice] = {
-            fileSlices.filter(fs => {
+    val prunedPartitionsAndFilteredFileSlices = prunedPartitionsAndFileSlices.map {
+      case (partitionOpt, fileSlices) =>
+        // Filter in candidate files based on the col-stats or record level index lookup
+        val candidateFileSlices: Seq[FileSlice] = {
+          fileSlices.filter(fs => {
+            if (candidateFilesNamesOpt.isDefined) {
+              // if any file in the file slice is part of the candidate file names, we need to include the file slice.
+              // in other words, if none of the files in the file slice are present in the candidate file names, we can filter out the file slice.
               val fileSliceFiles = fs.getLogFiles.map[String](JFunction.toJavaFunction[HoodieLogFile, String](lf => lf.getPath.getName))
                 .collect(Collectors.toSet[String])
               val baseFileStatusOpt = getBaseFileInfo(Option.apply(fs.getBaseFile.orElse(null)))
               baseFileStatusOpt.exists(f => fileSliceFiles.add(f.getPath.getName))
-              if (candidateFilesNamesOpt.isDefined) {
-                // if any file in the file slice is part of candidate file names, we need to inclue the file slice.
-                // in other words, if all files in the file slice is not present in candidate file names, we can filter out the file slice.
-                fileSliceFiles.stream().filter(fileSliceFile => candidateFilesNamesOpt.get.contains(fileSliceFile)).findAny().isPresent
-              } else {
-                true
-              }
-            })
-          }
+              fileSliceFiles.stream().filter(fileSliceFile => candidateFilesNamesOpt.get.contains(fileSliceFile)).findAny().isPresent
+            } else {
+              true
+            }
+          })
+        }
 
-          totalFileSliceSize += fileSlices.size
-          candidateFileSliceSize += candidateFileSlices.size
-          (partitionOpt, candidateFileSlices)
-      }
+        totalFileSliceSize += fileSlices.size
+        candidateFileSliceSize += candidateFileSlices.size
+        (partitionOpt, candidateFileSlices)
+    }
 
-      val skippingRatio =
-        if (!areAllFileSlicesCached) -1
-        else if (getAllFiles().nonEmpty && totalFileSliceSize > 0)
-          (totalFileSliceSize - candidateFileSliceSize) / totalFileSliceSize.toDouble
-        else 0
+    val skippingRatio =
+      if (!areAllFileSlicesCached) -1
+      else if (getAllFiles().nonEmpty && totalFileSliceSize > 0)
+        (totalFileSliceSize - candidateFileSliceSize) / totalFileSliceSize.toDouble
+      else 0
 
-      logInfo(s"Total file slices: $totalFileSliceSize; " +
-        s"candidate file slices after data skipping: $candidateFileSliceSize; " +
-        s"skipping percentage $skippingRatio")
+    logInfo(s"Total file slices: $totalFileSliceSize; " +
+      s"candidate file slices after data skipping: $candidateFileSliceSize; " +
+      s"skipping percentage $skippingRatio")
 
-      prunedPartitionsAndFilteredFileSlices
-    }
+    prunedPartitionsAndFilteredFileSlices
   }
 
   /**
@@ -346,6 +345,10 @@ case class HoodieFileIndex(spark: SparkSession,
       (Option.apply(partition), fileSlices.asScala.map(f => f.withLogFiles(includeLogFiles)).toSeq)
     }).toSeq)
   }
+  private def getExpressionIndexSupport(): ExpressionIndexSupport = {
+    indicesSupport.collectFirst({ case indexSupport: ExpressionIndexSupport => indexSupport }).get
+  }
+
 
   /**
    * In the fast bootstrap read code path, it gets the path info for the bootstrap base file instead of
   * skeleton file. Returns path info for the base file if available.
@@ -379,31 +382,44 @@ case class HoodieFileIndex(spark: SparkSession,
   // scalastyle:off return
   private def lookupCandidateFilesInMetadataTable(queryFilters: Seq[Expression],
                                                   prunedPartitionsAndFileSlices: Seq[(Option[BaseHoodieTableFileIndex.PartitionPath], Seq[FileSlice])],
-                                                  shouldPushDownFilesFilter: Boolean): Try[Option[Set[String]]] = Try {
-    // NOTE: For column stats, Data Skipping is only effective when it references columns that are indexed w/in
-    //       the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping:
-    //          - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since
-    //          CSI only contains stats for top-level columns, in this case for "struct")
-    //          - Any expression not directly referencing top-level column (for ex, sub-queries, since there's
-    //          nothing CSI in particular could be applied for)
-    //       For record index, Data Skipping is only effective when one of the query filter is of type EqualTo
-    //       or IN query on simple record keys. In such a case the record index is used to filter the file slices
-    //       and candidate files are obtained from these file slices.
-
-    lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema)
-    if (isDataSkippingEnabled) {
-      for(indexSupport: SparkBaseIndexSupport <- indicesSupport) {
-        if (indexSupport.isIndexAvailable && indexSupport.supportsQueryType(options)) {
-          val prunedFileNames = indexSupport.computeCandidateIsStrict(spark, this, queryFilters, queryReferencedColumns,
-            prunedPartitionsAndFileSlices, shouldPushDownFilesFilter)
-          if (prunedFileNames.nonEmpty) {
-            return Try(prunedFileNames)
+                                                  shouldPushDownFilesFilter: Boolean): Option[Set[String]] = {
+    Try {
+      // NOTE: For column stats, Data Skipping is only effective when it references columns that are indexed w/in
+      //       the Column Stats Index (CSI). Following cases could not be effectively handled by Data Skipping:
+      //          - Expressions on top-level column's fields (ie, for ex filters like "struct.field > 0", since
+      //          CSI only contains stats for top-level columns, in this case for "struct")
+      //          - Any expression not directly referencing top-level column (for ex, sub-queries, since there's
+      //          nothing CSI in particular could be applied for)
+      //       For record index, Data Skipping is only effective when one of the query filters is of type EqualTo
+      //       or IN query on simple record keys. In such a case the record index is used to filter the file slices
+      //       and candidate files are obtained from these file slices.
+      lazy val queryReferencedColumns = collectReferencedColumns(spark, queryFilters, schema)
+      var prunedFileNamesOpt: Option[Set[String]] = Option.empty
+      if (isDataSkippingEnabled) {
+        breakable {
+          for (indexSupport: SparkBaseIndexSupport <- indicesSupport) {
+            if (indexSupport.isIndexAvailable && indexSupport.supportsQueryType(options)) {
+              prunedFileNamesOpt = indexSupport.computeCandidateIsStrict(spark, this, queryFilters, queryReferencedColumns,
+                prunedPartitionsAndFileSlices, shouldPushDownFilesFilter)
+              if (prunedFileNamesOpt.nonEmpty) {
+                break()
+              }
+            }
          }
        }
      }
+      validateConfig()
+      prunedFileNamesOpt
+    } match {
+      case Success(opt) => opt
+      case Failure(e) =>
+        logError("Failed to lookup candidate files in File Index", e)
+
+        spark.sqlContext.getConf(DataSkippingFailureMode.configName, DataSkippingFailureMode.Fallback.value) match {
+          case DataSkippingFailureMode.Fallback.value => Option.empty
+          case DataSkippingFailureMode.Strict.value => throw new HoodieException(e);
+        }
     }
-    validateConfig()
-    Option.empty
   }
 
   override def refresh(): Unit = {
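For reviewers, a few self-contained sketches of the patterns used in the hunks above follow; none of them are Hudi code, and all type and function names in them are hypothetical.

First, the refactored private `filterFileSlices` helper (second hunk): a file slice survives when at least one of its files, base file or log files, appears in the candidate set returned by the index lookup, and nothing is dropped when no candidate set is available. A small sketch of that rule on plain Scala collections (`MiniSlice` and `filterSlices` are illustrative only):

```scala
// Hypothetical miniature of the slice-filtering rule: keep a slice when at least one of
// its files appears in the candidate set; when there is no candidate set, keep everything.
case class MiniSlice(baseFile: Option[String], logFiles: Seq[String])

object CandidateFilterSketch {
  def filterSlices(slices: Seq[MiniSlice], candidates: Option[Set[String]]): Seq[MiniSlice] =
    slices.filter { slice =>
      candidates match {
        case Some(names) =>
          // Keep the slice if any of its files is a candidate.
          (slice.baseFile.toSeq ++ slice.logFiles).exists(names.contains)
        case None =>
          true // no index information available, keep every slice
      }
    }

  def main(args: Array[String]): Unit = {
    val slices = Seq(
      MiniSlice(Some("f1.parquet"), Seq(".f1.log.1")),
      MiniSlice(Some("f2.parquet"), Nil))
    println(filterSlices(slices, Some(Set("f2.parquet")))) // keeps only the second slice
    println(filterSlices(slices, None))                    // keeps both slices
  }
}
```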
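Second, the new `getExpressionIndexSupport()` helper (third hunk) uses `collectFirst` to pull the first `ExpressionIndexSupport` out of `indicesSupport` and unwraps it with `.get`, which throws `NoSuchElementException` if no such entry is registered, so it assumes the expression index support is always present. A standalone illustration of the `collectFirst` pattern (hypothetical trait and classes; the sketch replaces the bare `.get` with an explicit error to make that failure mode visible):

```scala
// Hypothetical index-support hierarchy used only to illustrate the collectFirst lookup.
sealed trait IndexSupport { def name: String }
case class ColumnStatsSupport(name: String = "column_stats") extends IndexSupport
case class ExpressionSupport(name: String = "expression") extends IndexSupport

object CollectFirstSketch {
  val indices: Seq[IndexSupport] = Seq(ColumnStatsSupport(), ExpressionSupport())

  // collectFirst returns the first element matching the partial function, wrapped in an Option.
  def expressionSupport(): ExpressionSupport =
    indices.collectFirst { case s: ExpressionSupport => s }
      .getOrElse(throw new IllegalStateException("expression index support not registered"))

  def main(args: Array[String]): Unit =
    println(expressionSupport().name) // prints: expression
}
```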
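Finally, the last hunk replaces the non-local `return Try(...)` (the reason for the existing `// scalastyle:off return` marker) with `scala.util.control.Breaks`, and moves the `Success`/`Failure` handling from the call site into `lookupCandidateFilesInMetadataTable` itself, changing its return type from `Try[Option[Set[String]]]` to `Option[Set[String]]`. A self-contained sketch of that loop-with-break plus `Try`-handling shape, using hypothetical provider functions in place of the index support classes:

```scala
import scala.util.control.Breaks.{break, breakable}
import scala.util.{Failure, Success, Try}

object BreakableLookupSketch {
  // Hypothetical "index" providers: each may or may not produce candidate file names.
  type Provider = () => Option[Set[String]]

  def lookup(providers: Seq[Provider]): Option[Set[String]] =
    Try {
      var result: Option[Set[String]] = Option.empty
      breakable {
        for (provider <- providers) {
          result = provider()
          if (result.nonEmpty) {
            break() // first provider with an answer wins, like the first applicable index
          }
        }
      }
      result
    } match {
      case Success(opt) => opt
      case Failure(e) =>
        // Fallback behaviour: log and continue without data skipping rather than failing.
        Console.err.println(s"lookup failed, skipping data skipping: ${e.getMessage}")
        Option.empty
    }

  def main(args: Array[String]): Unit = {
    val providers: Seq[Provider] = Seq(() => None, () => Some(Set("f1.parquet")))
    println(lookup(providers)) // Some(Set(f1.parquet))
  }
}
```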