From 40c10151032a5c397a7ca416c7739d5338879070 Mon Sep 17 00:00:00 2001 From: Sergey Pershin Date: Thu, 5 Sep 2024 00:13:20 -0700 Subject: [PATCH] Use the apache hive logic to fetch escape delimiter in Hive SerDeOptions (#10919) Summary: Pull Request resolved: https://github.com/facebookincubator/velox/pull/10919 Use the apache hive logic to fetch escape delimiter. https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108 Reviewed By: Yuhta Differential Revision: D62139559 fbshipit-source-id: 757b0d24cf2c7bec56c8e12b7d902f6b7159a6f6 --- velox/connectors/hive/HiveConnectorUtil.cpp | 19 +++++++--- .../hive/tests/HiveConnectorUtilTest.cpp | 38 +++++++++++++++++-- 2 files changed, 49 insertions(+), 8 deletions(-) diff --git a/velox/connectors/hive/HiveConnectorUtil.cpp b/velox/connectors/hive/HiveConnectorUtil.cpp index 7b94217fa2ff..7dbfb9d05ffb 100644 --- a/velox/connectors/hive/HiveConnectorUtil.cpp +++ b/velox/connectors/hive/HiveConnectorUtil.cpp @@ -489,11 +489,20 @@ std::unique_ptr parseSerdeParameters( mapKeyDelim = parseDelimiter(mapKeyIt->second); } - uint8_t escapeChar; - bool hasEscapeChar = false; - if (escapeCharIt != serdeParameters.end() && !escapeCharIt->second.empty()) { - hasEscapeChar = true; - escapeChar = escapeCharIt->second[0]; + // If escape character is specified then we use it, unless it is empty - in + // which case we default to '\\'. + // If escape character is not specified (not in the map) we turn escaping off. 
+ // Logic is based on apache hive java code: + // https://github.com/apache/hive/blob/3f6f940af3f60cc28834268e5d7f5612e3b13c30/serde/src/java/org/apache/hadoop/hive/serde2/lazy/LazySerDeParameters.java#L105-L108 + uint8_t escapeChar = '\\'; + const bool hasEscapeChar = (escapeCharIt != serdeParameters.end()); + if (hasEscapeChar) { + if (!escapeCharIt->second.empty()) { + // If delim is convertible to uint8_t then we use it as character code, + // otherwise we use the 1st character of the string. + escapeChar = folly::tryTo<uint8_t>(escapeCharIt->second) + .value_or(escapeCharIt->second[0]); + } } auto serDeOptions = hasEscapeChar diff --git a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp index 1ca8ae88be78..1415b1493cb0 100644 --- a/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp +++ b/velox/connectors/hive/tests/HiveConnectorUtilTest.cpp @@ -159,6 +159,38 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) { performConfigure(); EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + // Empty escape delim means default escape char. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = ""; + expectedSerDe.escapeChar = '\\'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Convertible to byte escape char - use it. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "38"; + expectedSerDe.escapeChar = '&'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Overflow byte escape char - fall back to the 1st character of the string.
+ clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "381"; + expectedSerDe.escapeChar = '3'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + + // Not convertible string - fall back to the 1st character of the string. + clearDynamicParameters(FileFormat::TEXT); + serdeParameters[SerDeOptions::kEscapeChar] = "7!"; + expectedSerDe.escapeChar = '7'; + expectedSerDe.isEscaped = true; + performConfigure(); + EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); + // Modify all previous together. clearDynamicParameters(FileFormat::TEXT); serdeParameters[SerDeOptions::kFieldDelim] = '~'; @@ -167,13 +199,13 @@ TEST_F(HiveConnectorUtilTest, configureReaderOptions) { expectedSerDe.separators[size_t(SerDeSeparator::COLLECTION_DELIM)] = '$'; serdeParameters[SerDeOptions::kMapKeyDelim] = '*'; expectedSerDe.separators[size_t(SerDeSeparator::MAP_KEY_DELIM)] = '*'; + serdeParameters[SerDeOptions::kEscapeChar] = '*'; + expectedSerDe.escapeChar = '*'; + expectedSerDe.isEscaped = true; tableParameters[TableParameter::kSerializationNullFormat] = ""; expectedSerDe.nullString = ""; performConfigure(); EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); - EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); - EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); - EXPECT_TRUE(compareSerDeOptions(readerOptions.serDeOptions(), expectedSerDe)); // Tests other custom reader options. clearDynamicParameters(FileFormat::TEXT);