From 67fa0192b29b90c14672f9abff083686da190b5a Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 25 Aug 2023 09:44:49 +0800 Subject: [PATCH 1/6] support allowDecimalPrecisionLoss Signed-off-by: Yuan Zhou --- cpp/core/config/GlutenConfig.h | 2 ++ cpp/velox/compute/WholeStageResultIterator.cc | 2 ++ ep/build-velox/src/get_velox.sh | 4 +-- .../expression/ExpressionConverter.scala | 8 ----- .../gluten/utils/DecimalArithmeticUtil.scala | 29 +++++++++++++++++-- .../org/apache/gluten/GlutenConfig.scala | 1 + 6 files changed, 33 insertions(+), 13 deletions(-) diff --git a/cpp/core/config/GlutenConfig.h b/cpp/core/config/GlutenConfig.h index 3c47fb5479bd..cf34b6a72c80 100644 --- a/cpp/core/config/GlutenConfig.h +++ b/cpp/core/config/GlutenConfig.h @@ -34,6 +34,8 @@ const std::string kLegacySize = "spark.sql.legacy.sizeOfNull"; const std::string kSessionTimezone = "spark.sql.session.timeZone"; +const std::string kAllowPrecisionLoss = "spark.sql.decimalOperations.allowPrecisionLoss"; + const std::string kIgnoreMissingFiles = "spark.sql.files.ignoreMissingFiles"; const std::string kDefaultSessionTimezone = "spark.gluten.sql.session.timeZone.default"; diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index 83749061c1b8..a142023ad7a8 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -490,6 +490,8 @@ std::unordered_map WholeStageResultIterator::getQueryC } // Adjust timestamp according to the above configured session timezone. configs[velox::core::QueryConfig::kAdjustTimestampToTimezone] = "true"; + // To align with Spark's behavior, allow decimal precision loss or not. + configs[velox::core::QueryConfig::kAllowPrecisionLoss] = veloxCfg_->get(kAllowPrecisionLoss, "true"); // Align Velox size function with Spark. configs[velox::core::QueryConfig::kSparkLegacySizeOfNull] = std::to_string(veloxCfg_->get(kLegacySize, true)); diff --git a/ep/build-velox/src/get_velox.sh b/ep/build-velox/src/get_velox.sh index c26bedd5e9af..0ba3ac2ea2fc 100755 --- a/ep/build-velox/src/get_velox.sh +++ b/ep/build-velox/src/get_velox.sh @@ -16,8 +16,8 @@ set -exu -VELOX_REPO=https://github.com/oap-project/velox.git -VELOX_BRANCH=2024_05_06 +VELOX_REPO=https://github.com/zhouyuan/velox.git +VELOX_BRANCH=wip_decimal_precision_loss VELOX_HOME="" #Set on run gluten on HDFS diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index 7815cbf69ebd..f22f12c3ef87 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -511,14 +511,6 @@ object ExpressionConverter extends SQLConfHelper with Logging { replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)), expr) case b: BinaryArithmetic if DecimalArithmeticUtil.isDecimalArithmetic(b) => - // PrecisionLoss=true: velox support / ch not support - // PrecisionLoss=false: velox not support / ch support - // TODO ch support PrecisionLoss=true - if (!BackendsApiManager.getSettings.allowDecimalArithmetic) { - throw new GlutenNotSupportException( - s"Not support ${SQLConf.DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key} " + - s"${conf.decimalOperationsAllowPrecisionLoss} mode") - } val rescaleBinary = if (BackendsApiManager.getSettings.rescaleDecimalLiteral) { DecimalArithmeticUtil.rescaleLiteral(b) } else { diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala index 621dcc061ec7..cf914a5cc010 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala @@ -22,6 +22,7 @@ import org.apache.gluten.expression.{CheckOverflowTransformer, ChildTransformer, import org.apache.spark.sql.catalyst.analysis.DecimalPrecision import org.apache.spark.sql.catalyst.expressions.{Add, BinaryArithmetic, Cast, Divide, Expression, Literal, Multiply, Pmod, PromotePrecision, Remainder, Subtract} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.{ByteType, Decimal, DecimalType, IntegerType, LongType, ShortType} object DecimalArithmeticUtil { @@ -33,12 +34,14 @@ object DecimalArithmeticUtil { val MIN_ADJUSTED_SCALE = 6 val MAX_PRECISION = 38 + val MAX_SCALE = 38 // Returns the result decimal type of a decimal arithmetic computing. def getResultTypeForOperation( operationType: OperationType.Config, type1: DecimalType, type2: DecimalType): DecimalType = { + val allowPrecisionLoss = SQLConf.get.decimalOperationsAllowPrecisionLoss var resultScale = 0 var resultPrecision = 0 operationType match { @@ -54,8 +57,20 @@ object DecimalArithmeticUtil { resultScale = type1.scale + type2.scale resultPrecision = type1.precision + type2.precision + 1 case OperationType.DIVIDE => - resultScale = Math.max(MIN_ADJUSTED_SCALE, type1.scale + type2.precision + 1) - resultPrecision = type1.precision - type1.scale + type2.scale + resultScale + if (allowPrecisionLoss) { + resultScale = Math.max(MIN_ADJUSTED_SCALE, type1.scale + type2.precision + 1) + resultPrecision = type1.precision - type1.scale + type2.scale + resultScale + } else { + var intDig = Math.min(MAX_SCALE, type1.precision - type1.scale + type2.scale) + var decDig = Math.min(MAX_SCALE, Math.max(6, type1.scale + type2.precision + 1)) + val diff = (intDig + decDig) - MAX_SCALE + if (diff > 0) { + decDig -= diff / 2 + 1 + intDig = MAX_SCALE - decDig + } + resultScale = intDig + decDig + resultPrecision = decDig + } case OperationType.MOD => resultScale = Math.max(type1.scale, type2.scale) resultPrecision = @@ -63,7 +78,11 @@ object DecimalArithmeticUtil { case other => throw new GlutenNotSupportException(s"$other is not supported.") } - adjustScaleIfNeeded(resultPrecision, resultScale) + if (allowPrecisionLoss) { + adjustScaleIfNeeded(resultPrecision, resultScale) + } else { + bounded(resultPrecision, resultScale) + } } // Returns the adjusted decimal type when the precision is larger the maximum. @@ -79,6 +98,10 @@ object DecimalArithmeticUtil { DecimalType(typePrecision, typeScale) } + def bounded(precision: Int, scale: Int): DecimalType = { + DecimalType(Math.min(precision, MAX_PRECISION), Math.min(scale, MAX_SCALE)) + } + // If casting between DecimalType, unnecessary cast is skipped to avoid data loss, // because argument input type of "cast" is actually the res type of "+-*/". // Cast will use a wider input type, then calculates result type with less scale than expected. diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index 3c7ddf32c71f..414eb189067b 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -571,6 +571,7 @@ object GlutenConfig { GLUTEN_DEFAULT_SESSION_TIMEZONE_KEY, SQLConf.LEGACY_SIZE_OF_NULL.key, "spark.io.compression.codec", + "spark.sql.decimalOperations.allowPrecisionLoss", COLUMNAR_VELOX_BLOOM_FILTER_EXPECTED_NUM_ITEMS.key, COLUMNAR_VELOX_BLOOM_FILTER_NUM_BITS.key, COLUMNAR_VELOX_BLOOM_FILTER_MAX_NUM_BITS.key, From 1d6e45847f449c21c12db6e80ba265ae5415d954 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 7 May 2024 07:41:40 +0800 Subject: [PATCH 2/6] fix ck backend Signed-off-by: Yuan Zhou --- .../apache/gluten/backendsapi/velox/VeloxBackend.scala | 2 +- .../apache/gluten/expression/ExpressionConverter.scala | 8 ++++++++ 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala index 00b67fca7500..825aac3a159c 100644 --- a/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala +++ b/backends-velox/src/main/scala/org/apache/gluten/backendsapi/velox/VeloxBackend.scala @@ -479,7 +479,7 @@ object VeloxBackendSettings extends BackendSettingsApi { override def supportTransformWriteFiles: Boolean = true - override def allowDecimalArithmetic: Boolean = SQLConf.get.decimalOperationsAllowPrecisionLoss + override def allowDecimalArithmetic: Boolean = true override def enableNativeWriteFiles(): Boolean = { GlutenConfig.getConf.enableNativeWriter.getOrElse( diff --git a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala index f22f12c3ef87..7815cbf69ebd 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/expression/ExpressionConverter.scala @@ -511,6 +511,14 @@ object ExpressionConverter extends SQLConfHelper with Logging { replaceWithExpressionTransformerInternal(_, attributeSeq, expressionsMap)), expr) case b: BinaryArithmetic if DecimalArithmeticUtil.isDecimalArithmetic(b) => + // PrecisionLoss=true: velox support / ch not support + // PrecisionLoss=false: velox not support / ch support + // TODO ch support PrecisionLoss=true + if (!BackendsApiManager.getSettings.allowDecimalArithmetic) { + throw new GlutenNotSupportException( + s"Not support ${SQLConf.DECIMAL_OPERATIONS_ALLOW_PREC_LOSS.key} " + + s"${conf.decimalOperationsAllowPrecisionLoss} mode") + } val rescaleBinary = if (BackendsApiManager.getSettings.rescaleDecimalLiteral) { DecimalArithmeticUtil.rescaleLiteral(b) } else { From b715dfd4488bab097fc64959945c7b6ee6eaf358 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 7 May 2024 07:47:09 +0800 Subject: [PATCH 3/6] add back config in native Signed-off-by: Yuan Zhou --- shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala | 1 + 1 file changed, 1 insertion(+) diff --git a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala index 414eb189067b..bd7523c23a4f 100644 --- a/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala +++ b/shims/common/src/main/scala/org/apache/gluten/GlutenConfig.scala @@ -645,6 +645,7 @@ object GlutenConfig { ("spark.hadoop.input.write.timeout", "180000"), ("spark.hadoop.dfs.client.log.severity", "INFO"), ("spark.sql.orc.compression.codec", "snappy"), + ("spark.sql.decimalOperations.allowPrecisionLoss", "true"), ( COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.key, COLUMNAR_VELOX_FILE_HANDLE_CACHE_ENABLED.defaultValueString), From 65eead90750a6836728df174d1dd8186eddbae13 Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Tue, 7 May 2024 11:23:19 +0800 Subject: [PATCH 4/6] fix Signed-off-by: Yuan Zhou --- .../scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala index cf914a5cc010..aa13c1bdbb8c 100644 --- a/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala +++ b/gluten-core/src/main/scala/org/apache/gluten/utils/DecimalArithmeticUtil.scala @@ -68,8 +68,8 @@ object DecimalArithmeticUtil { decDig -= diff / 2 + 1 intDig = MAX_SCALE - decDig } - resultScale = intDig + decDig - resultPrecision = decDig + resultPrecision = intDig + decDig + resultScale = decDig } case OperationType.MOD => resultScale = Math.max(type1.scale, type2.scale) From ac1e04d75b79db8616c4b7caf8c0cae84bfebe0b Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 5 Jul 2024 13:30:59 +0800 Subject: [PATCH 5/6] fix param name Signed-off-by: Yuan Zhou --- cpp/velox/compute/WholeStageResultIterator.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index 54ba6cc1fb9e..aaa6d2afd8c3 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -456,7 +456,7 @@ std::unordered_map WholeStageResultIterator::getQueryC configs[velox::core::QueryConfig::kAdjustTimestampToTimezone] = "true"; // To align with Spark's behavior, allow decimal precision loss or not. - configs[velox::core::QueryConfig::kAllowPrecisionLoss] = veloxCfg_->get(kAllowPrecisionLoss, "true"); + configs[velox::core::QueryConfig::kSparkDecimalOperationsAllowPrecisionLoss] = veloxCfg_->get(kAllowPrecisionLoss, "true"); { // partial aggregation memory config From 57bb881355076421e219c5379eddd81eeb1dca6b Mon Sep 17 00:00:00 2001 From: Yuan Zhou Date: Fri, 5 Jul 2024 13:38:59 +0800 Subject: [PATCH 6/6] fix format Signed-off-by: Yuan Zhou --- cpp/velox/compute/WholeStageResultIterator.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/velox/compute/WholeStageResultIterator.cc b/cpp/velox/compute/WholeStageResultIterator.cc index aaa6d2afd8c3..8439545ca382 100644 --- a/cpp/velox/compute/WholeStageResultIterator.cc +++ b/cpp/velox/compute/WholeStageResultIterator.cc @@ -456,7 +456,8 @@ std::unordered_map WholeStageResultIterator::getQueryC configs[velox::core::QueryConfig::kAdjustTimestampToTimezone] = "true"; // To align with Spark's behavior, allow decimal precision loss or not. - configs[velox::core::QueryConfig::kSparkDecimalOperationsAllowPrecisionLoss] = veloxCfg_->get(kAllowPrecisionLoss, "true"); + configs[velox::core::QueryConfig::kSparkDecimalOperationsAllowPrecisionLoss] = + veloxCfg_->get(kAllowPrecisionLoss, "true"); { // partial aggregation memory config