// Review notes for this patch (GLUTEN-6879: partition values that contain blanks).
// These lines are commentary placed ahead of the first "diff --git" header.
//
// Files touched by the patch:
//   1. GlutenClickHouseHiveTableSuite.scala
//      Adds a test that (re)creates table test_tbl_6879 as PARQUET partitioned
//      by a STRING column `part`, inserts one row into the partition
//      part='part with spaces', compares `SELECT *` against vanilla Spark via
//      compareResultsAgainstVanillaSpark, then drops the table.
//   2. cpp-ch/local-engine/Common/GlutenStringUtils.cpp
//      parsePartitionTablePath: for every "/"-separated path token that
//      contains '=', the text before '=' is lower-cased and the text after it
//      is taken as the value; both are then passed through Poco::URI::decode
//      and the decoded pair is appended to the result. Previously the raw
//      (undecoded) pair was appended.
//      Adds dumpPartitionValue ("<first>=<second>") and dumpPartitionValues
//      ("[" + entries joined by ", " + "]").
//   3. cpp-ch/local-engine/Common/GlutenStringUtils.h
//      Declares the two new static dump helpers.
//   4. cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp
//      The constructor no longer builds partition_values_str by hand; the
//      LOG_INFO call passes GlutenStringUtils::dumpPartitionValues(part_vals).
//
// NOTE(review): the key is lower-cased BEFORE Poco::URI::decode runs, so any
//   letter produced by decoding a percent-escape keeps its original case.
//   Confirm this ordering is intended.
// NOTE(review): presumably Poco::URI::decode raises on a malformed
//   percent-escape; confirm that a literal '%' can never reach
//   parsePartitionTablePath unescaped, since the patch adds no handling.
// NOTE(review): in this copy the GlutenStringUtils.cpp include hunk shows
//   `#include` directives with no header name, which looks like the
//   angle-bracket targets were lost in extraction. TODO confirm against the
//   original patch.
// NOTE(review): the whole patch sits on three physical lines here (header,
//   hunk markers and content share lines); it likely needs its original line
//   breaks restored before a patch tool can consume it.
diff --git a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala index 57fda77144dc..83bc4e76b1bd 100644 --- a/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala +++ b/backends-clickhouse/src/test/scala/org/apache/gluten/execution/GlutenClickHouseHiveTableSuite.scala @@ -1314,4 +1314,30 @@ class GlutenClickHouseHiveTableSuite compareResultsAgainstVanillaSpark(select_sql, true, _ => {}) spark.sql("drop table test_tbl_6506") } + + test("GLUTEN-6879: Fix partition value diff when it contains blanks") { + val tableName = "test_tbl_6879" + sql(s"drop table if exists $tableName") + + val createSql = + s""" + |CREATE TABLE $tableName ( + | id INT, + | name STRING + |) PARTITIONED BY (part STRING) + |STORED AS PARQUET; + |""".stripMargin + sql(createSql) + + val insertSql = + s""" + |INSERT INTO $tableName PARTITION (part='part with spaces') + |VALUES (1, 'John Doe'); + |""".stripMargin + sql(insertSql) + + val selectSql = s"SELECT * FROM $tableName" + compareResultsAgainstVanillaSpark(selectSql, true, _ => {}) + sql(s"drop table if exists $tableName") + } } diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp index b6d11ac1b267..4a18f4ceda02 100644 --- a/cpp-ch/local-engine/Common/GlutenStringUtils.cpp +++ b/cpp-ch/local-engine/Common/GlutenStringUtils.cpp @@ -14,10 +14,11 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "GlutenStringUtils.h" -#include #include #include +#include + +#include "GlutenStringUtils.h" namespace local_engine { @@ -27,16 +28,46 @@ PartitionValues GlutenStringUtils::parsePartitionTablePath(const std::string & f Poco::StringTokenizer path(file, "/"); for (const auto & item : path) { - auto position = item.find('='); - if (position != std::string::npos) + auto pos = item.find('='); + if (pos != std::string::npos) { - result.emplace_back(PartitionValue(boost::algorithm::to_lower_copy(item.substr(0, position)), item.substr(position + 1))); + auto key = boost::to_lower_copy(item.substr(0, pos)); + auto value = item.substr(pos + 1); + + std::string unescaped_key; + std::string unescaped_value; + Poco::URI::decode(key, unescaped_key); + Poco::URI::decode(value, unescaped_value); + result.emplace_back(std::move(unescaped_key), std::move(unescaped_value)); } } return result; } + bool GlutenStringUtils::isNullPartitionValue(const std::string & value) { return value == "__HIVE_DEFAULT_PARTITION__"; } + +std::string GlutenStringUtils::dumpPartitionValue(const PartitionValue & value) +{ + return value.first + "=" + value.second; +} + +std::string GlutenStringUtils::dumpPartitionValues(const PartitionValues & values) +{ + std::string res; + res += "["; + + for (size_t i = 0; i < values.size(); ++i) + { + if (i) + res += ", "; + res += dumpPartitionValue(values[i]); + } + + res += "]"; + return res; +} + } diff --git a/cpp-ch/local-engine/Common/GlutenStringUtils.h b/cpp-ch/local-engine/Common/GlutenStringUtils.h index 023cb2b8d047..dd044135320f 100644 --- a/cpp-ch/local-engine/Common/GlutenStringUtils.h +++ b/cpp-ch/local-engine/Common/GlutenStringUtils.h @@ -28,5 +28,8 @@ class GlutenStringUtils public: static PartitionValues parsePartitionTablePath(const std::string & file); static bool isNullPartitionValue(const std::string & value); + + static std::string dumpPartitionValue(const PartitionValue & value); + static std::string dumpPartitionValues(const 
PartitionValues & values); }; } diff --git a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp index e449ede988ee..4499a9a559a1 100644 --- a/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp +++ b/cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp @@ -55,17 +55,13 @@ FormatFile::FormatFile( : context(context_), file_info(file_info_), read_buffer_builder(read_buffer_builder_) { PartitionValues part_vals = GlutenStringUtils::parsePartitionTablePath(file_info.uri_file()); - String partition_values_str = "["; for (size_t i = 0; i < part_vals.size(); ++i) { const auto & part = part_vals[i]; partition_keys.push_back(part.first); partition_values[part.first] = part.second; - if (i > 0) - partition_values_str += ", "; - partition_values_str += part.first + "=" + part.second; } - partition_values_str += "]"; + LOG_INFO( &Poco::Logger::get("FormatFile"), "Reading File path: {}, format: {}, range: {}, partition_index: {}, partition_values: {}", @@ -73,7 +69,7 @@ FormatFile::FormatFile( file_info.file_format_case(), std::to_string(file_info.start()) + "-" + std::to_string(file_info.start() + file_info.length()), file_info.partition_index(), - partition_values_str); + GlutenStringUtils::dumpPartitionValues(part_vals)); } FormatFilePtr FormatFileUtil::createFile(