Skip to content

Commit

Permalink
[GLUTEN-6879][CH] Fix partition value diff when it contains blank spaces (#6880)
Browse files Browse the repository at this point in the history

* fix partition values diff

* change as request

* change as request
  • Loading branch information
taiyang-li authored Aug 16, 2024
1 parent ccf66a3 commit fc9d273
Show file tree
Hide file tree
Showing 4 changed files with 67 additions and 11 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1314,4 +1314,30 @@ class GlutenClickHouseHiveTableSuite
compareResultsAgainstVanillaSpark(select_sql, true, _ => {})
spark.sql("drop table test_tbl_6506")
}

test("GLUTEN-6879: Fix partition value diff when it contains blanks") {
  // Regression test for GLUTEN-6879: writes one row into a partition whose value
  // contains blank spaces, then checks that reading the table back gives the same
  // result as vanilla Spark.
  val tableName = "test_tbl_6879"
  sql(s"drop table if exists $tableName")

  try {
    val createSql =
      s"""
         |CREATE TABLE $tableName (
         | id INT,
         | name STRING
         |) PARTITIONED BY (part STRING)
         |STORED AS PARQUET;
         |""".stripMargin
    sql(createSql)

    // The partition value deliberately contains spaces.
    val insertSql =
      s"""
         |INSERT INTO $tableName PARTITION (part='part with spaces')
         |VALUES (1, 'John Doe');
         |""".stripMargin
    sql(insertSql)

    val selectSql = s"SELECT * FROM $tableName"
    compareResultsAgainstVanillaSpark(selectSql, true, _ => {})
  } finally {
    // Clean up even when the comparison above fails, so a failing run does not
    // leave the table behind in the catalog.
    sql(s"drop table if exists $tableName")
  }
}
}
41 changes: 36 additions & 5 deletions cpp-ch/local-engine/Common/GlutenStringUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,11 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "GlutenStringUtils.h"
#include <filesystem>
#include <boost/algorithm/string.hpp>
#include <Poco/StringTokenizer.h>
#include <Poco/URI.h>

#include "GlutenStringUtils.h"

namespace local_engine
{
Expand All @@ -27,16 +28,46 @@ PartitionValues GlutenStringUtils::parsePartitionTablePath(const std::string & f
Poco::StringTokenizer path(file, "/");
for (const auto & item : path)
{
auto position = item.find('=');
if (position != std::string::npos)
auto pos = item.find('=');
if (pos != std::string::npos)
{
result.emplace_back(PartitionValue(boost::algorithm::to_lower_copy(item.substr(0, position)), item.substr(position + 1)));
auto key = boost::to_lower_copy(item.substr(0, pos));
auto value = item.substr(pos + 1);

std::string unescaped_key;
std::string unescaped_value;
Poco::URI::decode(key, unescaped_key);
Poco::URI::decode(value, unescaped_value);
result.emplace_back(std::move(unescaped_key), std::move(unescaped_value));
}
}
return result;
}

/// Returns true when `value` is exactly the marker string that this class treats
/// as a null partition value; any other string (including an empty one) yields false.
bool GlutenStringUtils::isNullPartitionValue(const std::string & value)
{
    static constexpr char null_partition_marker[] = "__HIVE_DEFAULT_PARTITION__";
    return value == null_partition_marker;
}

/// Formats a single partition entry as "<first>=<second>".
std::string GlutenStringUtils::dumpPartitionValue(const PartitionValue & value)
{
    const auto & key = value.first;
    const auto & val = value.second;

    std::string text;
    text.reserve(key.size() + 1 + val.size());
    text.append(key).append("=").append(val);
    return text;
}

std::string GlutenStringUtils::dumpPartitionValues(const PartitionValues & values)
{
std::string res;
res += "[";

for (size_t i = 0; i < values.size(); ++i)
{
if (i)
res += ", ";
res += dumpPartitionValue(values[i]);
}

res += "]";
return res;
}

}
3 changes: 3 additions & 0 deletions cpp-ch/local-engine/Common/GlutenStringUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,5 +28,8 @@ class GlutenStringUtils
public:
static PartitionValues parsePartitionTablePath(const std::string & file);
static bool isNullPartitionValue(const std::string & value);

static std::string dumpPartitionValue(const PartitionValue & value);
static std::string dumpPartitionValues(const PartitionValues & values);
};
}
8 changes: 2 additions & 6 deletions cpp-ch/local-engine/Storages/SubstraitSource/FormatFile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,25 +55,21 @@ FormatFile::FormatFile(
: context(context_), file_info(file_info_), read_buffer_builder(read_buffer_builder_)
{
PartitionValues part_vals = GlutenStringUtils::parsePartitionTablePath(file_info.uri_file());
String partition_values_str = "[";
for (size_t i = 0; i < part_vals.size(); ++i)
{
const auto & part = part_vals[i];
partition_keys.push_back(part.first);
partition_values[part.first] = part.second;
if (i > 0)
partition_values_str += ", ";
partition_values_str += part.first + "=" + part.second;
}
partition_values_str += "]";

LOG_INFO(
&Poco::Logger::get("FormatFile"),
"Reading File path: {}, format: {}, range: {}, partition_index: {}, partition_values: {}",
file_info.uri_file(),
file_info.file_format_case(),
std::to_string(file_info.start()) + "-" + std::to_string(file_info.start() + file_info.length()),
file_info.partition_index(),
partition_values_str);
GlutenStringUtils::dumpPartitionValues(part_vals));
}

FormatFilePtr FormatFileUtil::createFile(
Expand Down

0 comments on commit fc9d273

Please sign in to comment.