Skip to content

Commit

Permalink
Use the Apache Hive logic to fetch escape delimiter in Hive SerDeOptions (#10919)
Browse files Browse the repository at this point in the history

Summary:
Pull Request resolved: #10919

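Use the Apache Hive logic to fetch the escape delimiter: if the escape character is present in the serde parameters it is used (defaulting to '\\' when the value is empty, and interpreted as a character code when the value is convertible to a byte); if it is absent, escaping is turned off.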

https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108

Reviewed By: Yuhta

Differential Revision: D62139559

fbshipit-source-id: 757b0d24cf2c7bec56c8e12b7d902f6b7159a6f6
  • Loading branch information
Sergey Pershin authored and facebook-github-bot committed Sep 5, 2024
1 parent 94dcf02 commit 40c1015
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 8 deletions.
19 changes: 14 additions & 5 deletions velox/connectors/hive/HiveConnectorUtil.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -489,11 +489,20 @@ std::unique_ptr<dwio::common::SerDeOptions> parseSerdeParameters(
mapKeyDelim = parseDelimiter(mapKeyIt->second);
}

uint8_t escapeChar;
bool hasEscapeChar = false;
if (escapeCharIt != serdeParameters.end() && !escapeCharIt->second.empty()) {
hasEscapeChar = true;
escapeChar = escapeCharIt->second[0];
// If escape character is specified then we use it, unless it is empty - in
// which case we default to '\\'.
// If escape character is not specified (not in the map) we turn escaping off.
// Logic is based on apache hive java code:
// https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108
uint8_t escapeChar = '\\';
const bool hasEscapeChar = (escapeCharIt != serdeParameters.end());
if (hasEscapeChar) {
if (!escapeCharIt->second.empty()) {
// If delim is convertible to uint8_t then we use it as character code,
// otherwise we use the 1st character of the string.
escapeChar = folly::tryTo<uint8_t>(escapeCharIt->second)
.value_or(escapeCharIt->second[0]);
}
}

auto serDeOptions = hasEscapeChar
Expand Down
38 changes: 35 additions & 3 deletions velox/connectors/hive/tests/HiveConnectorUtilTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,38 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Empty escape delim means default escape char.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "";
expectedSerDe.escapeChar = '\\';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Convertible to byte escape char - use it.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "38";
expectedSerDe.escapeChar = '&';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Overflow byte escape char - fall back to the 1st character of the string.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "381";
expectedSerDe.escapeChar = '3';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Not convertible string - fall back to the 1st character of the string.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kEscapeChar] = "7!";
expectedSerDe.escapeChar = '7';
expectedSerDe.isEscaped = true;
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Modify all previous together.
clearDynamicParameters(FileFormat::TEXT);
serdeParameters[SerDeOptions::kFieldDelim] = '~';
Expand All @@ -167,13 +199,13 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) {
expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '$';
serdeParameters[SerDeOptions::kMapKeyDelim] = '*';
expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '*';
serdeParameters[SerDeOptions::kEscapeChar] = '*';
expectedSerDe.escapeChar = '*';
expectedSerDe.isEscaped = true;
tableParameters[TableParameter::kSerializationNullFormat] = "";
expectedSerDe.nullString = "";
performConfigure();
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));
EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe));

// Tests other custom reader options.
clearDynamicParameters(FileFormat::TEXT);
Expand Down

0 comments on commit 40c1015

Please sign in to comment.