diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala index 7a1b90b20ef5..5216a32ad3eb 100644 --- a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala +++ b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala @@ -17,9 +17,10 @@ package io.glutenproject.execution import io.glutenproject.GlutenConfig +import io.glutenproject.sql.shims.SparkShimLoader import org.apache.spark.SparkConf -import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.{AnalysisException, DataFrame, Row} import org.apache.spark.sql.execution.{GenerateExec, RDDScanExec} import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper import org.apache.spark.sql.functions.{avg, col, udf} @@ -675,4 +676,22 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla } } } + + test("Verify parquet field name with special character") { + withTable("t") { + + // https://github.com/apache/spark/pull/35229 Spark remove parquet field name check after 3.2 + if (!SparkShimLoader.getSparkVersion.startsWith("3.2")) { + sql("create table t using parquet as select sum(l_partkey) from lineitem") + runQueryAndCompare("select * from t") { + checkOperatorMatch[FileSourceScanExecTransformer] + } + } else { + val msg = intercept[AnalysisException] { + sql("create table t using parquet as select sum(l_partkey) from lineitem") + }.message + assert(msg.contains("contains invalid character")) + } + } + } } diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc index b7102d8ba6d1..18acba2ba9e0 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc @@ -50,27 +50,6 @@ static const std::unordered_set kBlackList = { "arrays_overlap", "approx_percentile"}; -bool validateColNames(const ::substrait::NamedStruct& schema) { - // Keeps the same logic with 'Tokenizer::isUnquotedPathCharacter' at - // https://github.com/oap-project/velox/blob/update/velox/type/Tokenizer.cpp#L154-L157. - auto isUnquotedPathCharacter = [](char c) { - return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' || c == '#' || c == '.' || c == '-' || - c == '_' || isalnum(c); - }; - - for (const auto& name : schema.names()) { - for (auto i = 0; i < name.size(); i++) { - auto c = name[i]; - if (!isUnquotedPathCharacter(c)) { - std::cout << "native validation failed due to: Illegal column charactor " << c << "in column " << name - << std::endl; - return false; - } - } - } - return true; -} - } // namespace bool SubstraitToVeloxPlanValidator::validateInputTypes( @@ -1160,13 +1139,6 @@ bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ReadRel& readRel } } - if (readRel.has_base_schema()) { - const auto& baseSchema = readRel.base_schema(); - if (!validateColNames(baseSchema)) { - logValidateMsg("native validation failed due to: Validation failed for column name contains illegal charactor."); - return false; - } - } return true; } diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala index 2e1b415e3f01..018484c7c1fa 100644 --- a/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala +++ b/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala @@ -123,7 +123,7 @@ object ConverterUtils extends Logging { // TODO: This is used only by `BasicScanExecTransformer`, // perhaps we can remove this in the future and use `withExprId` version consistently. def collectAttributeNamesWithoutExprId(attributes: Seq[Attribute]): JList[String] = { - collectAttributeNamesDFS(attributes)(genColumnNameWithoutExprId) + collectAttributeNamesDFS(attributes)(attr => normalizeColName(attr.name)) } private def collectAttributeNamesDFS(attributes: Seq[Attribute])(