facebookincubator · gaoyangxiaozhu · Jun 20, 2024 · Jun 20, 2024 · Jun 25, 2024 · Jun 27, 2024
diff --git a/velox/docs/functions/spark/string.rst b/velox/docs/functions/spark/string.rst
@@ -130,6 +130,29 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
 
         SELECT ltrim('ps', 'spark'); -- "ark"
 
+.. spark:function:: mask(string[, upperChar, lowerChar, digitChar, otherChar]) -> string
+
+    Returns a masked version of the input ``string``. A non-NULL replacement argument that is not exactly one character raises an error.
+    ``string``: string value to mask.
+    ``upperChar``: A single character STRING used to substitute upper case characters. The default is 'X'. If NULL, upper case characters remain unmasked.
+    ``lowerChar``: A single character STRING used to substitute lower case characters. The default is 'x'. If NULL, lower case characters remain unmasked.
+    ``digitChar``: A single character STRING used to substitute digits. The default is 'n'. If NULL, digits remain unmasked.
+    ``otherChar``: A single character STRING used to substitute any other character. The default is NULL, which leaves these characters unmasked. ::
+
+        SELECT mask('abcd-EFGH-8765-4321');  -- "xxxx-XXXX-nnnn-nnnn"
+        SELECT mask('abcd-EFGH-8765-4321', 'Q');  -- "xxxx-QQQQ-nnnn-nnnn"
+        SELECT mask('AbCD123-@$#');  -- "XxXXnnn-@$#"
+        SELECT mask('AbCD123-@$#', 'Q');  -- "QxQQnnn-@$#"
+        SELECT mask('AbCD123-@$#', 'Q', 'q');  -- "QqQQnnn-@$#"
+        SELECT mask('AbCD123-@$#', 'Q', 'q', 'd');  -- "QqQQddd-@$#"
+        SELECT mask('AbCD123-@$#', 'Q', 'q', 'd', 'o');  -- "QqQQdddoooo"
+        SELECT mask('AbCD123-@$#', NULL, 'q', 'd', 'o'); -- "AqCDdddoooo"
+        SELECT mask('AbCD123-@$#', NULL, NULL, 'd', 'o'); -- "AbCDdddoooo"
+        SELECT mask('AbCD123-@$#', NULL, NULL, NULL, 'o'); -- "AbCD123oooo"
+        SELECT mask(NULL, NULL, NULL, NULL, 'o'); -- NULL
+        SELECT mask(NULL); -- NULL
+        SELECT mask('AbCD123-@$#', NULL, NULL, NULL, NULL); -- "AbCD123-@$#"
+
 .. spark:function:: overlay(input, replace, pos, len) -> same as input
 
     Replace a substring of ``input`` starting at ``pos`` character with ``replace`` and
@@ -334,4 +357,4 @@ Unless specified otherwise, all functions return NULL if at least one of the arg
 
     Returns string with all characters changed to uppercase. ::
 
-        SELECT upper('SparkSql'); -- SPARKSQL
+        SELECT upper('SparkSql'); -- SPARKSQL
diff --git a/velox/functions/sparksql/Register.cpp b/velox/functions/sparksql/Register.cpp
@@ -477,6 +477,21 @@ void registerFunctions(const std::string& prefix) {
       int32_t>({prefix + "levenshtein"});
   registerFunction<LevenshteinDistanceFunction, int32_t, Varchar, Varchar>(
       {prefix + "levenshtein"});
+
+  registerFunction<MaskFunction, Varchar, Varchar>({prefix + "mask"});
+  registerFunction<MaskFunction, Varchar, Varchar, Varchar>({prefix + "mask"});
+  registerFunction<MaskFunction, Varchar, Varchar, Varchar, Varchar>(
+      {prefix + "mask"});
+  registerFunction<MaskFunction, Varchar, Varchar, Varchar, Varchar, Varchar>(
+      {prefix + "mask"});
+  registerFunction<
+      MaskFunction,
+      Varchar,
+      Varchar,
+      Varchar,
+      Varchar,
+      Varchar,
+      Varchar>({prefix + "mask"});
 }
 
 } // namespace sparksql

diff --git a/velox/functions/sparksql/String.h b/velox/functions/sparksql/String.h
@@ -24,6 +24,7 @@
 #include "velox/functions/UDFOutputString.h"
 #include "velox/functions/lib/string/StringCore.h"
 #include "velox/functions/lib/string/StringImpl.h"
+#include "velox/functions/prestosql/Utf8Utils.h"
 
 namespace facebook::velox::functions::sparksql {
 
@@ -1431,4 +1432,213 @@ struct LevenshteinDistanceFunction {
   }
 };
 
+// mask(string) -> string
+// mask(string, upperChar) -> string
+// mask(string, upperChar, lowerChar) -> string
+// mask(string, upperChar, lowerChar, digitChar) -> string
+// mask(string, upperChar, lowerChar, digitChar, otherChar) -> string
+//
+// Masks each character of the input string according to its class. Upper-case
+// characters are replaced with the second argument (default 'X'), lower-case
+// characters with the third argument (default 'x'), and digits with the fourth
+// argument (default 'n'). All other characters are replaced with the fifth
+// argument, whose default is NULL, meaning such characters are retained as is.
+// If any replacement argument is NULL, characters of the corresponding class
+// are retained unchanged. Each non-NULL replacement argument must be exactly
+// one character; otherwise a user error is raised. A NULL input returns NULL.
+template <typename T>
+struct MaskFunction {
+  VELOX_DEFINE_FUNCTION_TYPES(T);
+
+  // mask(string): masks with the defaults 'X', 'x' and 'n' and keeps all other
+  // characters. In every overload a NULL input produces a NULL result, which
+  // is signalled by returning false.
+  FOLLY_ALWAYS_INLINE bool callNullable(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>* inputPtr) {
+    if (inputPtr == nullptr) {
+      return false;
+    }
+    doCall(
+        result,
+        *inputPtr,
+        StringView{kMaskedUpperCase_},
+        StringView{kMaskedLowerCase_},
+        StringView{kMaskedDigit_},
+        std::nullopt);
+    return true;
+  }
+
+  // mask(string, upperChar): a NULL mask argument leaves its class unmasked.
+  FOLLY_ALWAYS_INLINE bool callNullable(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>* inputPtr,
+      const arg_type<Varchar>* upperCharPtr) {
+    if (inputPtr == nullptr) {
+      return false;
+    }
+    doCall(
+        result,
+        *inputPtr,
+        getMaskedChar(upperCharPtr),
+        StringView{kMaskedLowerCase_},
+        StringView{kMaskedDigit_},
+        std::nullopt);
+    return true;
+  }
+
+  // mask(string, upperChar, lowerChar): digits use the default mask 'n'.
+  FOLLY_ALWAYS_INLINE bool callNullable(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>* inputPtr,
+      const arg_type<Varchar>* upperCharPtr,
+      const arg_type<Varchar>* lowerCharPtr) {
+    if (inputPtr == nullptr) {
+      return false;
+    }
+    doCall(
+        result,
+        *inputPtr,
+        getMaskedChar(upperCharPtr),
+        getMaskedChar(lowerCharPtr),
+        StringView{kMaskedDigit_},
+        std::nullopt);
+    return true;
+  }
+
+  // mask(string, upperChar, lowerChar, digitChar): other characters are kept.
+  FOLLY_ALWAYS_INLINE bool callNullable(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>* inputPtr,
+      const arg_type<Varchar>* upperCharPtr,
+      const arg_type<Varchar>* lowerCharPtr,
+      const arg_type<Varchar>* digitCharPtr) {
+    if (inputPtr == nullptr) {
+      return false;
+    }
+    doCall(
+        result,
+        *inputPtr,
+        getMaskedChar(upperCharPtr),
+        getMaskedChar(lowerCharPtr),
+        getMaskedChar(digitCharPtr),
+        std::nullopt);
+    return true;
+  }
+
+  // mask(string, upperChar, lowerChar, digitChar, otherChar): all masks given.
+  FOLLY_ALWAYS_INLINE bool callNullable(
+      out_type<Varchar>& result,
+      const arg_type<Varchar>* inputPtr,
+      const arg_type<Varchar>* upperCharPtr,
+      const arg_type<Varchar>* lowerCharPtr,
+      const arg_type<Varchar>* digitCharPtr,
+      const arg_type<Varchar>* otherCharPtr) {
+    if (inputPtr == nullptr) {
+      return false;
+    }
+    doCall(
+        result,
+        *inputPtr,
+        getMaskedChar(upperCharPtr),
+        getMaskedChar(lowerCharPtr),
+        getMaskedChar(digitCharPtr),
+        getMaskedChar(otherCharPtr));
+    return true;
+  }
+
+ private:
+  // Writes the masked form of 'input' into 'result'. The input is walked one
+  // UTF-8 character at a time; a mask without a value leaves characters of its
+  // class unchanged. Bytes that do not decode as UTF-8 are consumed one at a
+  // time and handled like "other" characters.
+  void doCall(
+      out_type<Varchar>& result,
+      StringView input,
+      const std::optional<StringView> upperChar,
+      const std::optional<StringView> lowerChar,
+      const std::optional<StringView> digitChar,
+      const std::optional<StringView> otherChar) const {
+    const auto inputBuffer = input.data();
+    const size_t inputSize = input.size();
+    // A mask may span more bytes than the character it replaces (e.g. a
+    // multi-byte mask applied to ASCII input), so size the output for the
+    // worst case: every input byte replaced by the widest mask.
+    size_t maxMaskSize = 1;
+    for (const auto* mask : {&upperChar, &lowerChar, &digitChar, &otherChar}) {
+      if (mask->has_value() && (*mask)->size() > maxMaskSize) {
+        maxMaskSize = (*mask)->size();
+      }
+    }
+    result.reserve(inputSize * maxMaskSize);
+    auto outputBuffer = result.data();
+    size_t inputIdx = 0;
+    size_t outputIdx = 0;
+    while (inputIdx < inputSize) {
+      int charByteSize = 1;
+      const utf8proc_int32_t curCodePoint = utf8proc_codepoint(
+          &inputBuffer[inputIdx], inputBuffer + inputSize, charByteSize);
+      if (curCodePoint == -1) {
+        // Invalid UTF-8 (for example '\xED'): consume a single byte.
+        charByteSize = 1;
+      }
+      // Classify by Unicode general category: Lu is upper case, Ll is lower
+      // case and Nd is a decimal digit; everything else, including an invalid
+      // code point, falls through to 'otherChar'.
+      const utf8proc_propval_t category = utf8proc_category(curCodePoint);
+      const std::optional<StringView>* mask = &otherChar;
+      if (category == UTF8PROC_CATEGORY_LU) {
+        mask = &upperChar;
+      } else if (category == UTF8PROC_CATEGORY_LL) {
+        mask = &lowerChar;
+      } else if (category == UTF8PROC_CATEGORY_ND) {
+        mask = &digitChar;
+      }
+      // Emit the mask when one is set, otherwise the original bytes.
+      const char* source = &inputBuffer[inputIdx];
+      size_t sourceSize = static_cast<size_t>(charByteSize);
+      if (mask->has_value()) {
+        source = (*mask)->data();
+        sourceSize = (*mask)->size();
+      }
+      for (size_t i = 0; i < sourceSize; ++i) {
+        outputBuffer[outputIdx++] = source[i];
+      }
+      inputIdx += charByteSize;
+    }
+    result.resize(outputIdx);
+  }
+
+  // Validates a mask argument and converts it for doCall(). Returns
+  // std::nullopt for a NULL argument; raises a user error unless the argument
+  // holds exactly one character.
+  std::optional<StringView> getMaskedChar(const arg_type<Varchar>* maskChar) {
+    if (maskChar == nullptr) {
+      return std::nullopt;
+    }
+    const auto maskCharData = maskChar->data();
+    const auto maskCharSize = maskChar->size();
+    if (maskCharSize == 0) {
+      VELOX_USER_FAIL("Length of replacing char should be 1");
+    }
+    if (maskCharSize > 1) {
+      // More than one byte is only valid when all of them belong to the first
+      // code point. 'size' starts at 0 so that the check below also fails if
+      // the decoder leaves it untouched for malformed input.
+      int size = 0;
+      utf8proc_codepoint(maskCharData, maskCharData + maskCharSize, size);
+      VELOX_USER_CHECK_EQ(
+          maskCharSize, size, "Length of replacing char should be 1");
+    }
+    // Pass the length explicitly: the data is not known to be
+    // NUL-terminated, so it must not be measured with strlen().
+    return StringView(maskCharData, maskCharSize);
+  }
+
+  // Default masks for upper-case letters, lower-case letters and digits.
+  static constexpr std::string_view kMaskedUpperCase_{"X"};
+  static constexpr std::string_view kMaskedLowerCase_{"x"};
+  static constexpr std::string_view kMaskedDigit_{"n"};
+};
+
 } // namespace facebook::velox::functions::sparksql