diff --git a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala index 7a1b90b20ef5c..ab9189264888f 100644 --- a/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala +++ b/backends-velox/src/test/scala/io/glutenproject/execution/TestOperator.scala @@ -675,4 +675,16 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla } } } + + test("Verify custom tokenizer") { + // https://github.com/apache/spark/pull/35229 Spark remove parquet column name check after 3.2 + if (!spark.version.contains("3.2")) { + withTable("t") { + sql("create table t using parquet as select sum(l_partkey) from lineitem") + runQueryAndCompare("select * from t") { + checkOperatorMatch[FileSourceScanExecTransformer] + } + } + } + } } diff --git a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc index b7102d8ba6d13..18acba2ba9e02 100644 --- a/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc +++ b/cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc @@ -50,27 +50,6 @@ static const std::unordered_set kBlackList = { "arrays_overlap", "approx_percentile"}; -bool validateColNames(const ::substrait::NamedStruct& schema) { - // Keeps the same logic with 'Tokenizer::isUnquotedPathCharacter' at - // https://github.com/oap-project/velox/blob/update/velox/type/Tokenizer.cpp#L154-L157. - auto isUnquotedPathCharacter = [](char c) { - return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' || c == '#' || c == '.' || c == '-' || - c == '_' || isalnum(c); - }; - - for (const auto& name : schema.names()) { - for (auto i = 0; i < name.size(); i++) { - auto c = name[i]; - if (!isUnquotedPathCharacter(c)) { - std::cout << "native validation failed due to: Illegal column charactor " << c << "in column " << name - << std::endl; - return false; - } - } - } - return true; -} - } // namespace bool SubstraitToVeloxPlanValidator::validateInputTypes( @@ -1160,13 +1139,6 @@ bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ReadRel& readRel } } - if (readRel.has_base_schema()) { - const auto& baseSchema = readRel.base_schema(); - if (!validateColNames(baseSchema)) { - logValidateMsg("native validation failed due to: Validation failed for column name contains illegal charactor."); - return false; - } - } return true; } diff --git a/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala b/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala index 2e1b415e3f01c..6da593c7f8145 100644 --- a/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala +++ b/gluten-core/src/main/scala/io/glutenproject/expression/ConverterUtils.scala @@ -86,6 +86,10 @@ object ConverterUtils extends Logging { if (caseSensitive) name else name.toLowerCase(Locale.ROOT) } + def getAttributeName(attr: Attribute): String = { + normalizeColName(attr.name) + } + def getShortAttributeName(attr: Attribute): String = { val name = normalizeColName(attr.name) val subIndex = name.indexOf("(") @@ -123,7 +127,7 @@ object ConverterUtils extends Logging { // TODO: This is used only by `BasicScanExecTransformer`, // perhaps we can remove this in the future and use `withExprId` version consistently. def collectAttributeNamesWithoutExprId(attributes: Seq[Attribute]): JList[String] = { - collectAttributeNamesDFS(attributes)(genColumnNameWithoutExprId) + collectAttributeNamesDFS(attributes)(getAttributeName) } private def collectAttributeNamesDFS(attributes: Seq[Attribute])(