Skip to content

Commit

Permalink
Add word_stem scalar function
Browse files Browse the repository at this point in the history
Add snowball libstemmer v2.2.0 as one of the dependencies.
And use it to implement the word_stem() as a scalar UDF.

When using the libstemmer API, each language creates an sb_stemmer
instance which consumes 114 bytes, including the default 10 bytes for
the output stem.
It uses the realloc to increase the memory block for the output stem if
needed.

Signed-off-by: Yihong Wang <[email protected]>
  • Loading branch information
yhwang committed Apr 25, 2024
1 parent 83f5eed commit 2c14f09
Show file tree
Hide file tree
Showing 10 changed files with 343 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMake/resolve_dependency_modules/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ by Velox. See details on bundling below.
| wangle | v2024.04.01.00 | No |
| mvfst | v2024.04.01.00 | No |
| fbthrift | v2024.04.01.00 | No |
| libstemmer | 2.2.0 | Yes |
| DuckDB (testing) | 0.8.1 | Yes |
| cpr (testing) | 1.10.15 | Yes |

Expand Down
24 changes: 24 additions & 0 deletions CMake/resolve_dependency_modules/libstemmer/Makefile.patch
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@
EXEEXT=.exe
endif
CFLAGS=-O2
-CPPFLAGS=-Iinclude
+CPPFLAGS=-Iinclude -fPIC
all: libstemmer.a stemwords$(EXEEXT)
libstemmer.a: $(snowball_sources:.c=.o)
$(AR) -cru $@ $^
57 changes: 57 additions & 0 deletions CMake/resolve_dependency_modules/stemmer.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# Copyright (c) Facebook, Inc. and its affiliates.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
include_guard(GLOBAL)

set(VELOX_STEMMER_VERSION 2.2.0)
set(VELOX_STEMMER_BUILD_SHA256_CHECKSUM
b941d9fe9cf36b4e2f8d3873cd4d8b8775bd94867a1df8d8c001bb8b688377c3)
set(VELOX_STEMMER_SOURCE_URL
"https://snowballstem.org/dist/libstemmer_c-${VELOX_STEMMER_VERSION}.tar.gz"
)

resolve_dependency_url(STEMMER)

message(STATUS "Building stemmer from source")
find_program(MAKE_PROGRAM make REQUIRED)

set(STEMMER_PREFIX "${CMAKE_BINARY_DIR}/_deps/libstemmer")
set(STEMMER_INCLUDE_PATH ${STEMMER_PREFIX}/src/libstemmer/include)

# We can not use FetchContent as libstemmer does not use cmake
ExternalProject_Add(
libstemmer
PREFIX ${STEMMER_PREFIX}
SOURCE_DIR ${STEMMER_PREFIX}/src/libstemmer
URL ${VELOX_STEMMER_SOURCE_URL}
URL_HASH ${VELOX_STEMMER_BUILD_SHA256_CHECKSUM}
BUILD_IN_SOURCE TRUE
CONFIGURE_COMMAND ""
BUILD_COMMAND ${MAKE_PROGRAM}
INSTALL_COMMAND ""
PATCH_COMMAND git apply ${CMAKE_CURRENT_LIST_DIR}/libstemmer/Makefile.patch
BUILD_BYPRODUCTS
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
)

add_library(stemmer STATIC IMPORTED)
add_library(stemmer::stemmer ALIAS stemmer)
file(MAKE_DIRECTORY ${STEMMER_INCLUDE_PATH})
set_target_properties(
stemmer
PROPERTIES
IMPORTED_LOCATION
${STEMMER_PREFIX}/src/libstemmer/${CMAKE_STATIC_LIBRARY_PREFIX}stemmer${CMAKE_STATIC_LIBRARY_SUFFIX}
INTERFACE_INCLUDE_DIRECTORIES ${STEMMER_INCLUDE_PATH})

add_dependencies(stemmer libstemmer)
3 changes: 3 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -541,6 +541,9 @@ endif()
set_source(xsimd)
resolve_dependency(xsimd 10.0.0)

set(stemmer_SOURCE BUNDLED)
resolve_dependency(stemmer)

if(VELOX_BUILD_TESTING)
set(BUILD_TESTING ON)
include(CTest) # include after project() but before add_subdirectory()
Expand Down
37 changes: 37 additions & 0 deletions velox/docs/functions/presto/string.rst
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,43 @@ String Functions

Converts ``string`` to uppercase.

.. function:: word_stem(word) -> varchar

Returns the stem of ``word`` in the English language. If the ``word`` is not an English word,
the ``word`` in lowercase is returned.

.. function:: word_stem(word, lang) -> varchar

Returns the stem of ``word`` in the ``lang`` language. This function supports the following languages:

=========== ================
lang Language
=========== ================
``ca`` ``Catalan``
``da`` ``Danish``
``de`` ``German``
``en`` ``English``
``es`` ``Spanish``
``eu`` ``Basque``
``fi`` ``Finnish``
``fr`` ``French``
``hu`` ``Hungarian``
``hy`` ``Armenian``
``ir`` ``Irish``
``it`` ``Italian``
``lt`` ``Lithuanian``
``nl`` ``Dutch``
``no`` ``Norwegian``
``pt`` ``Portuguese``
``ro`` ``Romanian``
``ru`` ``Russian``
``sv`` ``Swedish``
``tr`` ``Turkish``
=========== ================

If the specified ``lang`` is not supported, this function throws a user error.


Unicode Functions
-----------------

Expand Down
3 changes: 2 additions & 1 deletion velox/functions/prestosql/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,8 @@ target_link_libraries(
velox_type_tz
velox_presto_types
velox_functions_util
Folly::folly)
Folly::folly
stemmer::stemmer)

set_property(TARGET velox_functions_prestosql_impl PROPERTY JOB_POOL_COMPILE
high_memory_pool)
Expand Down
132 changes: 132 additions & 0 deletions velox/functions/prestosql/WordStem.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <folly/container/F14Map.h>
#include <libstemmer.h>

#include "velox/functions/Udf.h"
#include "velox/functions/lib/string/StringImpl.h"

namespace facebook::velox::functions {

namespace detail {
// Wrap the sbstemmer library and use its sb_stemmer_stem
// to get word stem.
class Stemmer {
public:
Stemmer(sb_stemmer* stemmer) : sbStemmer_(stemmer) {
VELOX_CHECK_NOT_NULL(stemmer);
}

~Stemmer() {
sb_stemmer_delete(sbStemmer_);
}

// Returns the word stem or nullptr if an unlikely out-of-memory error occurs.
const char* stem(const std::string& input) {
return (const char*)(sb_stemmer_stem(
sbStemmer_,
reinterpret_cast<unsigned char const*>(input.c_str()),
input.length()));
}

private:
sb_stemmer* sbStemmer_;
};
} // namespace detail

/// word_stem function
/// word_stem(word) -> varchar
/// return the stem of the word in the English language
/// word_stem(word, lang) -> varchar
/// return the stem of the word in the specificed language
///
/// Use the snowball stemmer library to calculate the stem.
/// https://snowballstem.org
/// The website provides Java implementation which is used in Presto as well
/// as C implementation. Therefore, both Presto and Prestimissio
/// would have the same word stem results.
template <typename TExec>
struct WordStemFunction {
VELOX_DEFINE_FUNCTION_TYPES(TExec);

// ASCII input always produces ASCII result.
static constexpr bool is_default_ascii_behavior = true;

FOLLY_ALWAYS_INLINE void call(
out_type<Varchar>& result,
const arg_type<Varchar>& input) {
return doCall<false>(result, input);
}

FOLLY_ALWAYS_INLINE void callAscii(
out_type<Varchar>& result,
const arg_type<Varchar>& input) {
return doCall<true>(result, input);
}

FOLLY_ALWAYS_INLINE void call(
out_type<Varchar>& result,
const arg_type<Varchar>& input,
const arg_type<Varchar>& lang) {
return doCall<false>(result, input, lang.str());
}

FOLLY_ALWAYS_INLINE void callAscii(
out_type<Varchar>& result,
const arg_type<Varchar>& input,
const arg_type<Varchar>& lang) {
return doCall<true>(result, input, lang.str());
}

template <bool isAscii>
FOLLY_ALWAYS_INLINE void doCall(
out_type<Varchar>& result,
const arg_type<Varchar>& input,
const std::string& lang = "en") {
auto* stemmer = getStemmer(lang);
VELOX_USER_CHECK_NOT_NULL(
stemmer, "Unsupported stemmer language: {}", lang);

std::string lowerOutput;
stringImpl::lower<isAscii>(lowerOutput, input);
auto* stem = stemmer->stem(lowerOutput);
VELOX_CHECK_NOT_NULL(
stem, "Stemmer library returned a NULL (out-of-memory)")
result = stem;
}

private:
folly::F14FastMap<std::string, std::unique_ptr<detail::Stemmer>> stemmers_;

// Get a detail::Stemmer from the the map using the lang as the key or create
// a new one if it doesn't exist. Return nullptr if the specified lang is not
// supported.
detail::Stemmer* getStemmer(const std::string& lang) {
if (auto found = stemmers_.find(lang); found != stemmers_.end()) {
return found->second.get();
}
// Only support ASCII and UTF-8.
if (auto sbStemmer = sb_stemmer_new(lang.c_str(), "UTF_8")) {
auto* stemmer = new detail::Stemmer(sbStemmer);
stemmers_[lang] = std::unique_ptr<detail::Stemmer>(stemmer);
return stemmer;
}
return nullptr;
}
};
} // namespace facebook::velox::functions
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "velox/functions/prestosql/SplitPart.h"
#include "velox/functions/prestosql/SplitToMap.h"
#include "velox/functions/prestosql/StringFunctions.h"
#include "velox/functions/prestosql/WordStem.h"

namespace facebook::velox::functions {

Expand Down Expand Up @@ -127,5 +128,10 @@ void registerStringFunctions(const std::string& prefix) {
{prefix + "strrpos"});
registerFunction<StrRPosFunction, int64_t, Varchar, Varchar, int64_t>(
{prefix + "strrpos"});

// word_stem function
registerFunction<WordStemFunction, Varchar, Varchar>({prefix + "word_stem"});
registerFunction<WordStemFunction, Varchar, Varchar, Varchar>(
{prefix + "word_stem"});
}
} // namespace facebook::velox::functions
1 change: 1 addition & 0 deletions velox/functions/prestosql/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ add_executable(
URLFunctionsTest.cpp
Utf8Test.cpp
WidthBucketArrayTest.cpp
WordStemTest.cpp
ZipTest.cpp
ZipWithTest.cpp)

Expand Down
80 changes: 80 additions & 0 deletions velox/functions/prestosql/tests/WordStemTest.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright (c) Facebook, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <optional>
#include <string>

#include "velox/common/base/tests/GTestUtils.h"
#include "velox/functions/prestosql/tests/utils/FunctionBaseTest.h"

using namespace facebook::velox::functions::test;

namespace facebook::velox::functions {
namespace {
class WordStemTest : public FunctionBaseTest {
protected:
std::string wordStem(const std::string& word, const std::string& lang) {
return evaluateOnce<std::string>(
"word_stem(c0, c1)", std::optional(word), std::optional(lang))
.value();
}

std::string wordStem(const std::string& word) {
return evaluateOnce<std::string>("word_stem(c0)", std::optional(word))
.value();
}
};

/// Borrow test cases from Presto Java:
/// https://github.com/prestodb/presto/blob/master/presto-main/src/test/java/com/facebook/presto/operator/scalar/TestWordStemFunction.java
TEST_F(WordStemTest, asciiWord) {
EXPECT_EQ(wordStem(""), "");
EXPECT_EQ(wordStem("x"), "x");
EXPECT_EQ(wordStem("abc"), "abc");
EXPECT_EQ(wordStem("generally"), "general");
EXPECT_EQ(wordStem("useful"), "use");
EXPECT_EQ(wordStem("runs"), "run");
EXPECT_EQ(wordStem("run"), "run");
EXPECT_EQ(wordStem("authorized", "en"), "author");
EXPECT_EQ(wordStem("accessories", "en"), "accessori");
EXPECT_EQ(wordStem("intensifying", "en"), "intensifi");
EXPECT_EQ(wordStem("resentment", "en"), "resent");
EXPECT_EQ(wordStem("faithfulness", "en"), "faith");
EXPECT_EQ(wordStem("continuerait", "fr"), "continu");
EXPECT_EQ(wordStem("torpedearon", "es"), "torped");
EXPECT_EQ(wordStem("quilomtricos", "pt"), "quilomtr");
EXPECT_EQ(wordStem("pronunziare", "it"), "pronunz");
EXPECT_EQ(wordStem("auferstnde", "de"), "auferstnd");
}

TEST_F(WordStemTest, invalidLang) {
VELOX_ASSERT_THROW(
wordStem("hello", "xx"), "Unsupported stemmer language: xx");
}

TEST_F(WordStemTest, unicodeWord) {
EXPECT_EQ(
wordStem(
"\u004b\u0069\u0074\u0061\u0062\u0131\u006d\u0131\u007a\u0064\u0131",
"tr"),
"kitap");
EXPECT_EQ(
wordStem("\u0432\u0435\u0441\u0435\u043d\u043d\u0438\u0439", "ru"),
"\u0432\u0435\u0441\u0435\u043d");
}

} // namespace
} // namespace facebook::velox::functions

0 comments on commit 2c14f09

Please sign in to comment.