Skip to content

Commit

Permalink
[GLUTEN-3962][VL] Respect parsed attribute name and remove column nam…
Browse files Browse the repository at this point in the history
…e validate logic
  • Loading branch information
Yohahaha authored Dec 13, 2023
1 parent a329f96 commit 8edcb72
Show file tree
Hide file tree
Showing 3 changed files with 21 additions and 30 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,10 @@
package io.glutenproject.execution

import io.glutenproject.GlutenConfig
import io.glutenproject.sql.shims.SparkShimLoader

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, Row}
import org.apache.spark.sql.{AnalysisException, DataFrame, Row}
import org.apache.spark.sql.execution.{GenerateExec, RDDScanExec}
import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanHelper
import org.apache.spark.sql.functions.{avg, col, udf}
Expand Down Expand Up @@ -675,4 +676,22 @@ class TestOperator extends VeloxWholeStageTransformerSuite with AdaptiveSparkPla
}
}
}

test("Verify parquet field name with special character") {
withTable("t") {

// https://github.com/apache/spark/pull/35229 Spark remove parquet field name check after 3.2
if (!SparkShimLoader.getSparkVersion.startsWith("3.2")) {
sql("create table t using parquet as select sum(l_partkey) from lineitem")
runQueryAndCompare("select * from t") {
checkOperatorMatch[FileSourceScanExecTransformer]
}
} else {
val msg = intercept[AnalysisException] {
sql("create table t using parquet as select sum(l_partkey) from lineitem")
}.message
assert(msg.contains("contains invalid character"))
}
}
}
}
28 changes: 0 additions & 28 deletions cpp/velox/substrait/SubstraitToVeloxPlanValidator.cc
Original file line number Diff line number Diff line change
Expand Up @@ -50,27 +50,6 @@ static const std::unordered_set<std::string> kBlackList = {
"arrays_overlap",
"approx_percentile"};

bool validateColNames(const ::substrait::NamedStruct& schema) {
// Keeps the same logic with 'Tokenizer::isUnquotedPathCharacter' at
// https://github.com/oap-project/velox/blob/update/velox/type/Tokenizer.cpp#L154-L157.
auto isUnquotedPathCharacter = [](char c) {
return c == ':' || c == '$' || c == '-' || c == '/' || c == '@' || c == '|' || c == '#' || c == '.' || c == '-' ||
c == '_' || isalnum(c);
};

for (const auto& name : schema.names()) {
for (auto i = 0; i < name.size(); i++) {
auto c = name[i];
if (!isUnquotedPathCharacter(c)) {
std::cout << "native validation failed due to: Illegal column charactor " << c << "in column " << name
<< std::endl;
return false;
}
}
}
return true;
}

} // namespace

bool SubstraitToVeloxPlanValidator::validateInputTypes(
Expand Down Expand Up @@ -1160,13 +1139,6 @@ bool SubstraitToVeloxPlanValidator::validate(const ::substrait::ReadRel& readRel
}
}

if (readRel.has_base_schema()) {
const auto& baseSchema = readRel.base_schema();
if (!validateColNames(baseSchema)) {
logValidateMsg("native validation failed due to: Validation failed for column name contains illegal charactor.");
return false;
}
}
return true;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,7 @@ object ConverterUtils extends Logging {
// TODO: This is used only by `BasicScanExecTransformer`,
// perhaps we can remove this in the future and use `withExprId` version consistently.
def collectAttributeNamesWithoutExprId(attributes: Seq[Attribute]): JList[String] = {
collectAttributeNamesDFS(attributes)(genColumnNameWithoutExprId)
collectAttributeNamesDFS(attributes)(attr => normalizeColName(attr.name))
}

private def collectAttributeNamesDFS(attributes: Seq[Attribute])(
Expand Down

0 comments on commit 8edcb72

Please sign in to comment.